[PATCH 1/1] drm/xe: Flush all page faults when closing a VM
Matthew Brost
matthew.brost at intel.com
Tue Sep 10 17:58:51 UTC 2024
On Tue, Sep 10, 2024 at 01:39:41PM +0530, Ghimiray, Himal Prasad wrote:
>
>
> On 10-09-2024 04:02, Matthew Brost wrote:
> > Ensure all page faults on a VM are done executing when closing the VM,
> > before removing page table memory.
> >
> > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_gt_pagefault.c | 15 +++++++++++++++
> > drivers/gpu/drm/xe/xe_gt_pagefault.h | 2 ++
> > drivers/gpu/drm/xe/xe_vm.c | 7 +++++++
> > 3 files changed, 24 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > index 730eec07795e..2dd7065ce54a 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > @@ -304,6 +304,21 @@ static bool pf_queue_full(struct pf_queue *pf_queue)
> > PF_MSG_LEN_DW;
> > }
> > +/**
> > + * xe_gt_pagefault_flush() - Flush page faults for a VM
> > + * @gt: the GT object
> > + * @vm: the VM object
> > + *
> > + * Ensure all page faults for the given GT and VM pair are done executing.
> > + */
> > +void xe_gt_pagefault_flush(struct xe_gt *gt, struct xe_vm *vm)
> > +{
> > + struct pf_queue *pf_queue = gt->usm.pf_queue +
> > + (vm->usm.asid % NUM_PF_QUEUE);
> > +
> > + flush_work(&pf_queue->worker);
>
>
> Won't this mean we delay the VM kill until the PF worker has finished
> faults for other VMs using the same PF queue?
>
Yes. I realized after sending this that we can instead just check whether
the VM is closed in the PF handler, under vm->lock. Let me just do that.
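Roughly like this (an untested sketch; it assumes the handler has already
looked up the VM from the fault's ASID, and that xe_vm_is_closed() is
stable while vm->lock is held):

	down_write(&vm->lock);
	if (xe_vm_is_closed(vm)) {
		/*
		 * VM is being torn down; drop the fault rather than touch
		 * page table memory that is about to be freed.
		 */
		err = -ENOENT;
		goto unlock_vm;
	}

That would also avoid blocking the close path on faults queued for
unrelated VMs that happen to hash to the same PF queue.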
Matt
>
> > +}
> > +
> > int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
> > {
> > struct xe_gt *gt = guc_to_gt(guc);
> > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.h b/drivers/gpu/drm/xe/xe_gt_pagefault.h
> > index 839c065a5e4c..9f4166617f04 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.h
> > @@ -10,8 +10,10 @@
> > struct xe_gt;
> > struct xe_guc;
> > +struct xe_vm;
> > int xe_gt_pagefault_init(struct xe_gt *gt);
> > +void xe_gt_pagefault_flush(struct xe_gt *gt, struct xe_vm *vm);
> > void xe_gt_pagefault_reset(struct xe_gt *gt);
> > int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len);
> > int xe_guc_access_counter_notify_handler(struct xe_guc *guc, u32 *msg, u32 len);
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index 7acd5fc9d032..5139a731ae79 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -1543,6 +1543,13 @@ void xe_vm_close_and_put(struct xe_vm *vm)
> > xe_vm_close(vm);
> > if (xe_vm_in_preempt_fence_mode(vm))
> > flush_work(&vm->preempt.rebind_work);
> > + if (xe_vm_in_fault_mode(vm)) {
> > + struct xe_gt *gt;
> > +
> > + /* Stop all async faults */
> > + for_each_gt(gt, xe, id)
> > + xe_gt_pagefault_flush(gt, vm);
> > + }
> > down_write(&vm->lock);
> > for_each_tile(tile, xe, id) {