[PATCH 1/1] drm/xe: Flush all page faults when closing a VM

Tue Sep 10 08:09:41 UTC 2024

On 10-09-2024 04:02, Matthew Brost wrote:
> Ensure all page faults on a VM are done when closing the VM, before
> removing page table memory.
> 
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
>   drivers/gpu/drm/xe/xe_gt_pagefault.c | 15 +++++++++++++++
>   drivers/gpu/drm/xe/xe_gt_pagefault.h |  2 ++
>   drivers/gpu/drm/xe/xe_vm.c           |  7 +++++++
>   3 files changed, 24 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> index 730eec07795e..2dd7065ce54a 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> @@ -304,6 +304,21 @@ static bool pf_queue_full(struct pf_queue *pf_queue)
>   		PF_MSG_LEN_DW;
>   }
>   
> +/**
> + * xe_gt_pagefault_flush() - Flush page faults for a VM
> + * @gt: the GT object
> + * @vm: the VM object
> + *
> + * Ensure all page faults per GT and VM pair are done executing.
> + */
> +void xe_gt_pagefault_flush(struct xe_gt *gt, struct xe_vm *vm)
> +{
> +	struct pf_queue *pf_queue = gt->usm.pf_queue +
> +		(vm->usm.asid % NUM_PF_QUEUE);
> +
> +	flush_work(&pf_queue->worker);

Won't this mean we will be delaying the VM kill until the pf worker has
executed for other VMs using the same pf queue?

> +}
> +
>   int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
>   {
>   	struct xe_gt *gt = guc_to_gt(guc);
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.h b/drivers/gpu/drm/xe/xe_gt_pagefault.h
> index 839c065a5e4c..9f4166617f04 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.h
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.h
> @@ -10,8 +10,10 @@
>   
>   struct xe_gt;
>   struct xe_guc;
> +struct xe_vm;
>   
>   int xe_gt_pagefault_init(struct xe_gt *gt);
> +void xe_gt_pagefault_flush(struct xe_gt *gt, struct xe_vm *vm);
>   void xe_gt_pagefault_reset(struct xe_gt *gt);
>   int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len);
>   int xe_guc_access_counter_notify_handler(struct xe_guc *guc, u32 *msg, u32 len);
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 7acd5fc9d032..5139a731ae79 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -1543,6 +1543,13 @@ void xe_vm_close_and_put(struct xe_vm *vm)
>   	xe_vm_close(vm);
>   	if (xe_vm_in_preempt_fence_mode(vm))
>   		flush_work(&vm->preempt.rebind_work);
> +	if (xe_vm_in_fault_mode(vm)) {
> +		struct xe_gt *gt;
> +
> +		/* Stop all async faults */
> +		for_each_gt(gt, xe, id)
> +			xe_gt_pagefault_flush(gt, vm);
> +	}
>   
>   	down_write(&vm->lock);
>   	for_each_tile(tile, xe, id) {