[Intel-xe] [PATCH 2/3] drm/xe: Ban a VM if rebind worker hits an error

Matthew Brost matthew.brost at intel.com
Mon Jun 12 15:47:47 UTC 2023


On Fri, Jun 09, 2023 at 11:10:15PM +0200, Thomas Hellström wrote:
> On Fri, 2023-06-09 at 11:26 -0700, Matthew Brost wrote:
> > We cannot recover a VM if the rebind worker hits an error, so ban the
> > VM if this happens to ensure we do not attempt to place this VM on
> > the hardware again.
> > 
> > A follow-up will inform the user if this happens.
> > 
> > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_engine.c     |  5 +++++
> >  drivers/gpu/drm/xe/xe_exec.c       |  2 +-
> >  drivers/gpu/drm/xe/xe_trace.h      |  5 +++++
> >  drivers/gpu/drm/xe/xe_vm.c         | 27 +++++++++++++++++++++++----
> >  drivers/gpu/drm/xe/xe_vm.h         | 10 ++++++++++
> >  drivers/gpu/drm/xe/xe_vm_madvise.c |  2 +-
> >  drivers/gpu/drm/xe/xe_vm_types.h   |  5 +++--
> >  7 files changed, 48 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_engine.c b/drivers/gpu/drm/xe/xe_engine.c
> > index b3036c4a8ec3..b1f808c08dc5 100644
> > --- a/drivers/gpu/drm/xe/xe_engine.c
> > +++ b/drivers/gpu/drm/xe/xe_engine.c
> > @@ -597,6 +597,11 @@ int xe_engine_create_ioctl(struct drm_device *dev, void *data,
> >                 if (XE_IOCTL_ERR(xe, !vm))
> >                         return -ENOENT;
> >  
> > +               if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
> > +                       xe_vm_put(vm);
> > +                       return -ENOENT;
> > +               }
> 
> Hm. What if the VM gets banned exactly here? Shouldn't the banned and
> closed status be read inside a vm lock? (which must then protect the
> entire vm->flags member)
> 

It doesn't really matter; worst case we create an engine, but we won't
be able to submit to it as the exec has the same check. I can move it
if you think it is a big deal, but IMO it is not.
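
For reference, a rough, untested sketch of what I think you're
suggesting is below. It assumes xe_vm_kill() keeps setting
XE_VM_FLAG_BANNED with vm->lock held for write (as in this patch), and
that taking vm->lock for read here doesn't conflict with anything
xe_engine_create() takes internally:

	/* In xe_engine_create_ioctl(), after looking up the VM: */
	down_read(&vm->lock);
	if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
		up_read(&vm->lock);
		xe_vm_put(vm);
		return -ENOENT;
	}
	up_read(&vm->lock);

	e = xe_engine_create(xe, vm, logical_mask,
			     args->width, hwe, ENGINE_FLAG_PERSISTENT);
	xe_vm_put(vm);

Even then, the VM can be banned right after up_read() and before the
engine is created, so the check in xe_exec_ioctl() is what actually
stops submissions on a banned VM.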

Matt 

> /Thomas
> 
> 
> > +
> >                 e = xe_engine_create(xe, vm, logical_mask,
> >                                      args->width, hwe, ENGINE_FLAG_PERSISTENT);
> >                 xe_vm_put(vm);
> > diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> > index e44076ee2e11..27c11812d733 100644
> > --- a/drivers/gpu/drm/xe/xe_exec.c
> > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > @@ -294,7 +294,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >         if (err)
> >                 goto err_unlock_list;
> >  
> > -       if (xe_vm_is_closed(engine->vm)) {
> > +       if (xe_vm_is_closed_or_banned(engine->vm)) {
> >                 drm_warn(&xe->drm, "Trying to schedule after vm is closed\n");
> >                 err = -EIO;
> >                 goto err_engine_end;
> > diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
> > index 2f8eb7ebe9a7..eb89ac934394 100644
> > --- a/drivers/gpu/drm/xe/xe_trace.h
> > +++ b/drivers/gpu/drm/xe/xe_trace.h
> > @@ -472,6 +472,11 @@ DECLARE_EVENT_CLASS(xe_vm,
> >                               __entry->asid)
> >  );
> >  
> > +DEFINE_EVENT(xe_vm, xe_vm_kill,
> > +            TP_PROTO(struct xe_vm *vm),
> > +            TP_ARGS(vm)
> > +);
> > +
> >  DEFINE_EVENT(xe_vm, xe_vm_create,
> >              TP_PROTO(struct xe_vm *vm),
> >              TP_ARGS(vm)
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index e4311c48cc54..1966a583b761 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -514,6 +514,24 @@ void xe_vm_unlock_dma_resv(struct xe_vm *vm,
> >  
> >  #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
> >  
> > +static void xe_vm_kill(struct xe_vm *vm)
> > +{
> > +       struct ww_acquire_ctx ww;
> > +       struct xe_engine *e;
> > +
> > +       lockdep_assert_held(&vm->lock);
> > +
> > +       xe_vm_lock(vm, &ww, 0, false);
> > +       vm->flags |= XE_VM_FLAG_BANNED;
> > +       trace_xe_vm_kill(vm);
> > +
> > +       list_for_each_entry(e, &vm->preempt.engines, compute.link)
> > +               e->ops->kill(e);
> > +       xe_vm_unlock(vm, &ww);
> > +
> > +       /* TODO: Inform user the VM is banned */
> > +}
> > +
> >  static void preempt_rebind_work_func(struct work_struct *w)
> >  {
> >         struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
> > @@ -533,7 +551,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
> >         XE_BUG_ON(!xe_vm_in_compute_mode(vm));
> >         trace_xe_vm_rebind_worker_enter(vm);
> >  
> > -       if (xe_vm_is_closed(vm)) {
> > +       if (xe_vm_is_closed_or_banned(vm)) {
> >                 trace_xe_vm_rebind_worker_exit(vm);
> >                 return;
> >         }
> > @@ -662,11 +680,12 @@ static void preempt_rebind_work_func(struct work_struct *w)
> >                         goto retry;
> >                 }
> >         }
> > +       if (err)
> > +               xe_vm_kill(vm);
> >         up_write(&vm->lock);
> >  
> >         free_preempt_fences(&preempt_fences);
> >  
> > -       XE_WARN_ON(err < 0);    /* TODO: Kill VM or put in error state */
> >         trace_xe_vm_rebind_worker_exit(vm);
> >  }
> >  
> > @@ -1127,7 +1146,7 @@ xe_vm_find_overlapping_vma(struct xe_vm *vm, const struct xe_vma *vma)
> >  {
> >         struct rb_node *node;
> >  
> > -       if (xe_vm_is_closed(vm))
> > +       if (xe_vm_is_closed_or_banned(vm))
> >                 return NULL;
> >  
> >         XE_BUG_ON(vma->end >= vm->size);
> > @@ -3048,7 +3067,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> >                 goto free_objs;
> >         }
> >  
> > -       if (XE_IOCTL_ERR(xe, xe_vm_is_closed(vm))) {
> > +       if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
> >                 drm_err(dev, "VM closed while we began looking up?\n");
> >                 err = -ENOENT;
> >                 goto put_vm;
> > diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
> > index 372f26153209..858207a20017 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.h
> > +++ b/drivers/gpu/drm/xe/xe_vm.h
> > @@ -49,6 +49,16 @@ static inline bool xe_vm_is_closed(struct xe_vm *vm)
> >         return !vm->size;
> >  }
> >  
> > +static inline bool xe_vm_is_banned(struct xe_vm *vm)
> > +{
> > +       return vm->flags & XE_VM_FLAG_BANNED;
> > +}
> > +
> > +static inline bool xe_vm_is_closed_or_banned(struct xe_vm *vm)
> > +{
> > +       return xe_vm_is_closed(vm) || xe_vm_is_banned(vm);
> > +}
> > +
> >  struct xe_vma *
> >  xe_vm_find_overlapping_vma(struct xe_vm *vm, const struct xe_vma *vma);
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c
> > index 0f5eef337037..76458f8d57f3 100644
> > --- a/drivers/gpu/drm/xe/xe_vm_madvise.c
> > +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
> > @@ -313,7 +313,7 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data,
> >         if (XE_IOCTL_ERR(xe, !vm))
> >                 return -EINVAL;
> >  
> > -       if (XE_IOCTL_ERR(xe, xe_vm_is_closed(vm))) {
> > +       if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
> >                 err = -ENOENT;
> >                 goto put_vm;
> >         }
> > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> > index 76af6ac0fa84..566b67abe1c3 100644
> > --- a/drivers/gpu/drm/xe/xe_vm_types.h
> > +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> > @@ -178,8 +178,9 @@ struct xe_vm {
> >  #define XE_VM_FLAG_MIGRATION           BIT(3)
> >  #define XE_VM_FLAG_SCRATCH_PAGE                BIT(4)
> >  #define XE_VM_FLAG_FAULT_MODE          BIT(5)
> > -#define XE_VM_FLAG_GT_ID(flags)                (((flags) >> 6) & 0x3)
> > -#define XE_VM_FLAG_SET_TILE_ID(tile)   ((tile)->id << 6)
> > +#define XE_VM_FLAG_BANNED              BIT(6)
> > +#define XE_VM_FLAG_GT_ID(flags)                (((flags) >> 7) & 0x3)
> > +#define XE_VM_FLAG_SET_TILE_ID(tile)   ((tile)->id << 7)
> >         unsigned long flags;
> >  
> >         /** @composite_fence_ctx: context composite fence */
> 

