[Intel-xe] [PATCH 2/2] drm/xe: Make bind engines safe

Thomas Hellström thomas.hellstrom at linux.intel.com
Mon Jul 3 15:53:35 UTC 2023


On Sun, 2023-07-02 at 15:17 -0700, Matthew Brost wrote:
> We currently have a race between bind engines which can result in
> corrupted page tables leading to faults.
> 
> A simple example:
> bind A 0x0000-0x1000, engine A, has unsatisfied in-fence
> bind B 0x1000-0x2000, engine B, no in-fences
> exec A uses 0x1000-0x2000
> 

Don't we have an unbind race as well?
exec A uses 0x1000-0x2000
unbind A unbinds 0x1000-0x2000 behind an exec in-fence.
unbind B unbinds 0x0000-0x1000

Boom.


> Bind B will pass bind A and exec A will fault. This occurs as bind A
> programs the root of the page table in a bind job which is held up by
> an
> in-fence. Bind B in this case just programs a leaf entry of the
> structure.
> 
> To fix this, introduce a per-VM maple tree to track cross-engine
> conflicts. In the above example bind A would insert a dependency in the
> maple tree with a key of 0x0-0x7fffffffff, bind B would find that
> dependency, and its bind job would be scheduled behind the unsatisfied
> in-fence and bind A's job.
> 
> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_migrate.c         | 76 +++++++++++++++++++++++--
>  drivers/gpu/drm/xe/xe_sched_job_types.h |  7 +++
>  drivers/gpu/drm/xe/xe_vm.c              | 18 ++++++
>  drivers/gpu/drm/xe/xe_vm_types.h        |  6 ++
>  4 files changed, 103 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index b2fa2cb5f537..efcbb3909b2e 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -1109,6 +1109,14 @@ static bool no_in_syncs(struct xe_sync_entry *syncs, u32 num_syncs)
>         return true;
>  }
>  
> +static void remove_dep_job(struct maple_tree *mtree,
> +                          struct xe_sched_job *dep_job)
> +{
> +       mtree_erase(mtree, dep_job->bind_args.index);
> +       dma_fence_put(dep_job->fence);
> +       xe_sched_job_put(dep_job);
> +}
> +
>  /**
>   * xe_migrate_update_pgtables() - Pipelined page-table update
>   * @m: The migrate context.
> @@ -1150,8 +1158,8 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
>         struct xe_tile *tile = m->tile;
>         struct xe_gt *gt = tile->primary_gt;
>         struct xe_device *xe = tile_to_xe(tile);
> -       struct xe_sched_job *job;
> -       struct dma_fence *fence;
> +       struct xe_sched_job *job, *dep_job;
> +       struct dma_fence *fence, *dep_fence;
>         struct drm_suballoc *sa_bo = NULL;
>         struct xe_vma *vma = pt_update->vma;
>         struct xe_bb *bb;
> @@ -1161,9 +1169,31 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
>         bool usm = !eng && xe->info.supports_usm;
>         bool first_munmap_rebind = vma && vma->first_munmap_rebind;
>         struct xe_engine *eng_override = !eng ? m->eng : eng;
> +       int level = 0;
> +       bool found_dep = false;
> +       unsigned long index, last;
> +
> +       /* Find inter-engine deps */
> +       for (i = 0; i < num_updates; i++) {
> +               const struct xe_vm_pgtable_update *update = &updates[i];
> +
> +               if (update->pt->level > level)
> +                       level = update->pt->level;
> +       }

This looks sub-optimal. What if index should be aligned down to level 2
and last aligned up to level 1?
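Something like this rough, untested sketch of what I mean
(level_at_start() / level_at_end() are made-up helpers, not existing
code): align each end of the conflict range only by the deepest level
actually touched at that end, rather than widening both ends by the
single worst-case level:

        unsigned int start_level = level_at_start(updates, num_updates);
        unsigned int end_level = level_at_end(updates, num_updates);

        /* Each end gets its own alignment instead of sharing max(level) */
        index = ALIGN_DOWN(xe_vma_start(vma), 0x1ull << xe_pt_shift(start_level));
        last = ALIGN(xe_vma_end(vma), 0x1ull << xe_pt_shift(end_level)) - 1;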


> +       index = ALIGN_DOWN(xe_vma_start(vma), 0x1ull << xe_pt_shift(level));

Doesn't build here.

> +       last = ALIGN(xe_vma_end(vma), 0x1ull << xe_pt_shift(level)) - 1;
> +       mt_for_each(&vm->mtree[tile->id], dep_job, index, last) {

Did this really pass the migrate selftest?

> +               dep_fence = dep_job->fence;
> +
> +               if (dma_fence_is_signaled(dep_fence))
> +                       remove_dep_job(&vm->mtree[tile->id], dep_job);
> +               else if (eng_override != dep_job->engine)
> +                       found_dep = true;
> +       }
>  
>         /* Use the CPU if no in syncs and engine is idle */
> -       if (no_in_syncs(syncs, num_syncs) && xe_engine_is_idle(eng_override)) {
> +       if (no_in_syncs(syncs, num_syncs) && xe_engine_is_idle(eng_override) &&
> +           !found_dep) {
>                 fence =  xe_migrate_update_pgtables_cpu(m, vm, bo, updates,
>                                                         num_updates,
>                                                         first_munmap_rebind,
> @@ -1293,11 +1323,47 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
>         if (err)
>                 goto err_job;
>  
> +       /* Add inter-engine dep */
> +       index = ALIGN_DOWN(xe_vma_start(vma), 0x1ull << xe_pt_shift(level));
> +       last = ALIGN(xe_vma_end(vma), 0x1ull << xe_pt_shift(level)) - 1;
> +       job->bind_args.index = index;
> +       job->bind_args.last = last;
> +       mt_for_each(&vm->mtree[tile->id], dep_job, index, last) {
> +               dep_fence = dep_job->fence;
> +
> +               if (dma_fence_is_signaled(dep_fence)) {
> +                       remove_dep_job(&vm->mtree[tile->id], dep_job);
> +               } else {
> +                       if (eng_override != dep_job->engine) {
> +                               dma_fence_get(dep_fence);
> +                               err = drm_sched_job_add_dependency(&job->drm,
> +                                                                  dep_fence);
> +                               if (err)
> +                                       goto err_job;
> +                       }
> +
> +                       if (dep_job->bind_args.index < job->bind_args.index)
> +                               job->bind_args.index = dep_job->bind_args.index;
> +                       if (dep_job->bind_args.last > job->bind_args.last)
> +                               job->bind_args.last = dep_job->bind_args.last;
> +

This makes it possible to use an mtree, but it may carry unneeded
dependencies forward long after the binds in question have signaled.
Why not use an interval tree? Sure, you'd need to instantiate a new one
to get the needed 64-bit VA ranges, but you could then separate it out
as a utility holding a range and a fence, which avoids open-coding it
all here and adding vm-specific fields to the sched job.


> +                       remove_dep_job(&vm->mtree[tile->id], dep_job);
> +               }
> +       }
> +       err = mtree_insert_range(&vm->mtree[tile->id], job->bind_args.index,
> +                                job->bind_args.last, job, GFP_KERNEL);
> +       if (err)
> +               goto err_job;
> +
> +       dma_fence_get(job->fence);
> +       xe_sched_job_get(job);
> +
>         if (ops->pre_commit) {
>                 err = ops->pre_commit(pt_update);
>                 if (err)
> -                       goto err_job;
> +                       goto err_mtree;
>         }
> +
>         xe_sched_job_arm(job);
>         fence = dma_fence_get(&job->drm.s_fence->finished);
>         xe_sched_job_push(job);
> @@ -1310,6 +1376,8 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
>  
>         return fence;
>  
> +err_mtree:
> +       remove_dep_job(&vm->mtree[tile->id], job);
>  err_job:
>         xe_sched_job_put(job);
>  err_bb:
> diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
> index 5534bfacaa16..246313926e05 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job_types.h
> +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
> @@ -37,6 +37,13 @@ struct xe_sched_job {
>                 /** @value: write back value */
>                 u64 value;
>         } user_fence;
> +       /** @bind_args: if a bind job, the bind arguments */
> +       struct {
> +               /** @index: index of job a bind job stored in a mtree */
> +               unsigned long index;
> +               /** @last: last of job a bind job stored in a mtree */
> +               unsigned long last;
> +       } bind_args;

What lock protects these?

>         /** @migrate_flush_flags: Additional flush flags for migration jobs */
>         u32 migrate_flush_flags;
>         /** @batch_addr: batch buffer address of job */
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index dbff75a9087b..711d79e5fd3e 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -1233,6 +1233,8 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
>                     tile->id != XE_VM_FLAG_GT_ID(flags))
>                         continue;
>  
> +               mt_init(&vm->mtree[id]);
> +
>                 vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
>                 if (IS_ERR(vm->pt_root[id])) {
>                         err = PTR_ERR(vm->pt_root[id]);
> @@ -1402,6 +1404,21 @@ void xe_vm_close_and_put(struct xe_vm *vm)
>         }
>  
>         down_write(&vm->lock);
> +
> +       for_each_tile(tile, xe, id) {
> +               struct xe_sched_job *job;
> +               unsigned long index = 0, max = ULONG_MAX;
> +
> +               if (!vm->pt_root[id])
> +                       continue;
> +
> +               mt_for_each(&vm->mtree[id], job, index, max) {
> +                       mtree_erase(&vm->mtree[tile->id], job->bind_args.index);
> +                       dma_fence_put(job->fence);
> +                       xe_sched_job_put(job);
> +               }
> +       }
> +
>         xe_vm_lock(vm, &ww, 0, false);
>         while (vm->vmas.rb_node) {
>                 struct xe_vma *vma = to_xe_vma(vm->vmas.rb_node);
> @@ -1507,6 +1524,7 @@ static void vm_destroy_work_func(struct work_struct *w)
>         xe_vm_lock(vm, &ww, 0, false);
>         for_each_tile(tile, xe, id) {
>                 if (vm->pt_root[id]) {
> +                       mtree_destroy(&vm->mtree[id]);
>                         xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
>                         vm->pt_root[id] = NULL;
>                 }
> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> index c148dd49a6ca..e313ecfa1aa0 100644
> --- a/drivers/gpu/drm/xe/xe_vm_types.h
> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> @@ -8,6 +8,7 @@
>  
>  #include <linux/dma-resv.h>
>  #include <linux/kref.h>
> +#include <linux/maple_tree.h>
>  #include <linux/mmu_notifier.h>
>  #include <linux/scatterlist.h>
>  
> @@ -160,6 +161,11 @@ struct xe_vm {
>  
>         struct kref refcount;
>  
> +       /**
> +        * @mtree: the &maple_tree to track of conflicts between bind engines
> +        */
> +       struct maple_tree mtree[XE_MAX_TILES_PER_DEVICE];
> +

And similarly what protects the mtree? Please document.

>         /* engine used for (un)binding vma's */
>         struct xe_engine *eng[XE_MAX_TILES_PER_DEVICE];
>  

As a general question, why do you choose to bleed dependency tracking
that should be known only to the vm code into the migrate code and the
sched job code, and then open-code it all in the migrate code?
xe_migrate_update_pgtables() was already hard to follow; that's why I
used pre_commit() for separation in the first patch, passing the
xe_sched_job around only for adding dependencies.
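To illustrate (purely hypothetical sketch; xe_vm_add_range_deps() is a
made-up VM-side helper, and the job is assumed reachable from pt_update
as in patch 1), the range-dependency handling could stay behind the
pt_update ops:

        /* Keep the conflict bookkeeping on the VM side; the migrate code
         * only calls the hook and never sees the tracking structure. */
        static int bind_pre_commit(struct xe_migrate_pt_update *pt_update)
        {
                struct xe_vma *vma = pt_update->vma;

                return xe_vm_add_range_deps(vma->vm, pt_update->job,
                                            xe_vma_start(vma),
                                            xe_vma_end(vma) - 1);
        }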

Also I think it would be beneficial to separate out the range-fence
tracking into something like the below, to avoid leaking vm stuff into
xe_sched_job.
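Rough, untested sketch only (the names are placeholders and the locking
still needs to be decided):

        /* One tracked range: a VA interval plus the fence that must signal
         * before the interval can be considered conflict-free again. */
        struct xe_range_fence {
                struct rb_node rb;
                u64 start;
                u64 last;
                u64 __subtree_last;     /* for INTERVAL_TREE_DEFINE() */
                struct dma_fence *fence;
        };

        struct xe_range_fence_tree {
                struct rb_root_cached root;
                /* + whatever lock protects insert / remove / iteration */
        };

        /* Insert @rfence covering [start, last], holding a ref on @fence. */
        int xe_range_fence_insert(struct xe_range_fence_tree *tree,
                                  struct xe_range_fence *rfence,
                                  u64 start, u64 last,
                                  struct dma_fence *fence);

        /* Iterate entries overlapping [start, last]; the caller adds the
         * unsignaled fences as scheduler dependencies, and signaled entries
         * can be garbage collected as they are encountered. */
        struct xe_range_fence *
        xe_range_fence_first(struct xe_range_fence_tree *tree, u64 start, u64 last);
        struct xe_range_fence *
        xe_range_fence_next(struct xe_range_fence *rfence, u64 start, u64 last);

That would keep the interval tree, the fence refcounting and the garbage
collection of signaled ranges in one small utility instead of spread
across the migrate and sched job code.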

/Thomas


