[PATCH 21/23] drm/xe/svm: GPU page fault support
Zeng, Oak
oak.zeng at intel.com
Tue Jan 23 03:09:08 UTC 2024
> -----Original Message-----
> From: Welty, Brian <brian.welty at intel.com>
> Sent: Monday, January 22, 2024 9:06 PM
> To: Zeng, Oak <oak.zeng at intel.com>; dri-devel at lists.freedesktop.org; intel-
> xe at lists.freedesktop.org
> Cc: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>; Ghimiray, Himal Prasad
> <himal.prasad.ghimiray at intel.com>; Thomas.Hellstrom at linux.intel.com;
> Vishwanathapura, Niranjana <niranjana.vishwanathapura at intel.com>; Brost,
> Matthew <matthew.brost at intel.com>
> Subject: Re: [PATCH 21/23] drm/xe/svm: GPU page fault support
>
>
> On 1/17/2024 2:12 PM, Oak Zeng wrote:
> > On a gpu page fault of a virtual address, try to fault the virtual
> > address range into the gpu page table and let HW retry the faulting
> > address.
> >
> > Right now, we always migrate the whole vma which contains the fault
> > address to GPU. This is subject to change once a more sophisticated
> > migration policy is introduced: deciding whether to migrate memory to
> > GPU or map it in place in CPU memory, and at what granularity.
> >
> > There is a rather complicated locking strategy in this patch. See more
> > details in xe_svm_doc.h, lock design section.
> >
> > Signed-off-by: Oak Zeng <oak.zeng at intel.com>
> > Cc: Niranjana Vishwanathapura <niranjana.vishwanathapura at intel.com>
> > Cc: Matthew Brost <matthew.brost at intel.com>
> > Cc: Thomas Hellström <thomas.hellstrom at intel.com>
> > Cc: Brian Welty <brian.welty at intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_gt_pagefault.c | 7 ++
> > drivers/gpu/drm/xe/xe_svm.c | 116 +++++++++++++++++++++++++++
> > drivers/gpu/drm/xe/xe_svm.h | 6 ++
> > drivers/gpu/drm/xe/xe_svm_range.c | 43 ++++++++++
> > 4 files changed, 172 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > index 467d68f8332e..462603abab8a 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > @@ -22,6 +22,7 @@
> > #include "xe_pt.h"
> > #include "xe_trace.h"
> > #include "xe_vm.h"
> > +#include "xe_svm.h"
> >
> > enum fault_type {
> > NOT_PRESENT = 0,
> > @@ -131,6 +132,11 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
> > if (!vm || !xe_vm_in_fault_mode(vm))
> > return -EINVAL;
> >
> > + if (vm->svm) {
> > + ret = xe_svm_handle_gpu_fault(vm, gt, pf);
> > + goto put_vm;
> > + }
> > +
> > retry_userptr:
> > /*
> > * TODO: Avoid exclusive lock if VM doesn't have userptrs, or
> > @@ -219,6 +225,7 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
> > if (ret >= 0)
> > ret = 0;
> > }
> > +put_vm:
> > xe_vm_put(vm);
> >
> > return ret;
> > diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> > index 0c13690a19f5..1ade8d7f0ab2 100644
> > --- a/drivers/gpu/drm/xe/xe_svm.c
> > +++ b/drivers/gpu/drm/xe/xe_svm.c
> > @@ -12,6 +12,7 @@
> > #include "xe_svm.h"
> > #include <linux/hmm.h>
> > #include <linux/scatterlist.h>
> > +#include <drm/xe_drm.h>
> > #include "xe_pt.h"
> > #include "xe_assert.h"
> > #include "xe_vm_types.h"
> > @@ -206,3 +207,118 @@ static int svm_populate_range(struct xe_svm_range *svm_range,
> > kvfree(pfns);
> > return ret;
> > }
> > +
> > +/**
> > + * svm_access_allowed() - Determine whether read and/or write to vma is allowed
> > + *
> > + * @write: true means a read and write access; false: read only access
> > + */
> > +static bool svm_access_allowed(struct vm_area_struct *vma, bool write)
> > +{
> > + unsigned long access = VM_READ;
> > +
> > + if (write)
> > + access |= VM_WRITE;
> > +
> > + return (vma->vm_flags & access) == access;
> > +}
> > +
> > +/**
> > + * svm_should_migrate() - Determine whether we should migrate a range to
> > + * a destination memory region
> > + *
> > + * @range: The svm memory range to consider
> > + * @dst_region: target destination memory region
> > + * @is_atomic_fault: Is the intended migration triggered by an atomic access?
> > + * On some platforms, we have to migrate memory to guarantee atomic correctness.
> > + */
> > +static bool svm_should_migrate(struct xe_svm_range *range,
> > + struct xe_mem_region *dst_region, bool is_atomic_fault)
> > +{
> > + return true;
> > +}
> > +
> > +/**
> > + * xe_svm_handle_gpu_fault() - gpu page fault handler for svm subsystem
> > + *
> > + * @vm: The vm of the fault.
> > + * @gt: The gt hardware on which the fault happens.
> > + * @pf: page fault descriptor
> > + *
> > + * Work out backing memory for the fault address, migrate memory from
> > + * system memory to gpu vram if necessary, and map the fault address to
> > + * GPU so GPU HW can retry the last operation which caused the GPU
> > + * page fault.
> > + */
> > +int xe_svm_handle_gpu_fault(struct xe_vm *vm,
> > + struct xe_gt *gt,
> > + struct pagefault *pf)
> > +{
> > + u8 access_type = pf->access_type;
> > + u64 page_addr = pf->page_addr;
> > + struct hmm_range hmm_range;
> > + struct vm_area_struct *vma;
> > + struct xe_svm_range *range;
> > + struct mm_struct *mm;
> > + struct xe_svm *svm;
> > + int ret = 0;
> > +
> > + svm = vm->svm;
> > + if (!svm)
> > + return -EINVAL;
> > +
> > + mm = svm->mm;
> > + mmap_read_lock(mm);
> > + vma = find_vma_intersection(mm, page_addr, page_addr + 4);
> > + if (!vma) {
> > + mmap_read_unlock(mm);
> > + return -ENOENT;
> > + }
> > +
> > + if (!svm_access_allowed (vma, access_type != ACCESS_TYPE_READ)) {
> > + mmap_read_unlock(mm);
> > + return -EPERM;
> > + }
> > +
> > + range = xe_svm_range_from_addr(svm, page_addr);
> > + if (!range) {
> > + range = xe_svm_range_create(svm, vma);
> > + if (!range) {
> > + mmap_read_unlock(mm);
> > + return -ENOMEM;
> > + }
> > + }
> > +
> > + if (svm_should_migrate(range, &gt->tile->mem.vram,
> > + access_type == ACCESS_TYPE_ATOMIC))
> > + /** Migrate whole svm range for now.
> > + * This is subject to change once we introduce a migration granularity
> > + * parameter for user to select.
> > + *
> > + * Migration is best effort. If we failed to migrate to vram,
> > + * we just map that range to gpu in system memory. For cases
> > + * such as gpu atomic operation which requires memory to be
> > + * resident in vram, we will fault again and retry migration.
> > + */
> > + svm_migrate_range_to_vram(range, vma, gt->tile);
> > +
> > + ret = svm_populate_range(range, &hmm_range, vma->vm_flags & VM_WRITE);
> > + mmap_read_unlock(mm);
> > + /** There is no need to destroy this range. Range can be reused later */
> > + if (ret)
> > + goto free_pfns;
> > +
> > + /**FIXME: set the DM, AE flags in PTE*/
> > + ret = xe_bind_svm_range(vm, gt->tile, &hmm_range,
> > + !(vma->vm_flags & VM_WRITE) ? DRM_XE_VM_BIND_FLAG_READONLY : 0);
> > + /** A concurrent cpu page table update happened.
> > + * Return success so we will retry everything
> > + * on the next gpu page fault.
> > + */
> > + if (ret == -EAGAIN)
> > + ret = 0;
> > +
> > +free_pfns:
> > + kvfree(hmm_range.hmm_pfns);
> > + return ret;
> > +}
> > diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
> > index 659bcb7927d6..a8ff4957a9b8 100644
> > --- a/drivers/gpu/drm/xe/xe_svm.h
> > +++ b/drivers/gpu/drm/xe/xe_svm.h
> > @@ -20,6 +20,7 @@
> >
> > struct xe_vm;
> > struct mm_struct;
> > +struct pagefault;
> >
> > #define XE_MAX_SVM_PROCESS 5 /* Support up to 32 SVM processes */
> > extern DECLARE_HASHTABLE(xe_svm_table, XE_MAX_SVM_PROCESS);
> > @@ -94,6 +95,8 @@ bool xe_svm_range_belongs_to_vma(struct mm_struct *mm,
> > void xe_svm_range_unregister_mmu_notifier(struct xe_svm_range *range);
> > int xe_svm_range_register_mmu_notifier(struct xe_svm_range *range);
> > void xe_svm_range_prepare_destroy(struct xe_svm_range *range);
> > +struct xe_svm_range *xe_svm_range_create(struct xe_svm *svm,
> > + struct vm_area_struct *vma);
> >
> > int xe_svm_build_sg(struct hmm_range *range, struct sg_table *st);
> > int xe_svm_devm_add(struct xe_tile *tile, struct xe_mem_region *mem);
> > @@ -106,4 +109,7 @@ int xe_devm_alloc_pages(struct xe_tile *tile,
> >
> > void xe_devm_free_blocks(struct list_head *blocks);
> > void xe_devm_page_free(struct page *page);
> > +int xe_svm_handle_gpu_fault(struct xe_vm *vm,
> > + struct xe_gt *gt,
> > + struct pagefault *pf);
> > #endif
> > diff --git a/drivers/gpu/drm/xe/xe_svm_range.c b/drivers/gpu/drm/xe/xe_svm_range.c
> > index dfb4660dc26f..05c088dddc2d 100644
> > --- a/drivers/gpu/drm/xe/xe_svm_range.c
> > +++ b/drivers/gpu/drm/xe/xe_svm_range.c
> > @@ -182,3 +182,46 @@ void xe_svm_range_prepare_destroy(struct xe_svm_range *range)
> > xe_invalidate_svm_range(vm, range->start, length);
> > xe_svm_range_unregister_mmu_notifier(range);
> > }
> > +
> > +static void add_range_to_svm(struct xe_svm_range *range)
> > +{
> > + range->inode.start = range->start;
> > + range->inode.last = range->end;
> > + mutex_lock(&range->svm->mutex);
> > + interval_tree_insert(&range->inode, &range->svm->range_tree);
> > + mutex_unlock(&range->svm->mutex);
> > +}
>
> I have the following question / concern.
>
> I believe we are planning for what we call 'shared allocations' to use
> svm. But what we call device-only allocations will continue to use
> GEM_CREATE, and those are in the BO-centric world.
>
> But the application still needs to have one single managed
> address space, yes? In other words, how will these co-exist?
> It seems you will have collisions.
Yes, those two types of allocators have to co-exist.
>
> For example as hmm_range_fault brings a range from host into GPU address
> space, what if it was already allocated and in use by VM_BIND for
> a GEM_CREATE allocated buffer? That is of course an application error,
> but KMD needs to detect it, and provide one single managed address
> space across all allocations from the application....
This is a very good question. Yes, agreed, we should check for this application error. Fortunately this is doable. All vm_bind virtual address ranges are tracked in the xe_vm/drm_gpuvm struct. In this case, we should iterate the drm_gpuvm rb trees of *all* gpu devices (as xe_vm is for one device only) to see whether there is a conflict. Will make the change soon.
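Roughly what I have in mind is something like the sketch below. This is illustrative only: xe_svm_check_vm_bind_conflict(), the svm->vm_list list and the svm_link member are hypothetical names, locking of each vm is omitted, and I am assuming drm_gpuva_find_first() can be used to probe each device's gpuvm interval tree for an overlapping mapping.

static int xe_svm_check_vm_bind_conflict(struct xe_svm *svm,
					 u64 start, u64 length)
{
	struct xe_vm *vm;

	/* Walk every per-device vm attached to this process' svm.
	 * svm->vm_list / svm_link are hypothetical, for illustration.
	 */
	list_for_each_entry(vm, &svm->vm_list, svm_link) {
		struct drm_gpuva *va;

		/* Any gpuva overlapping [start, start + length) means a
		 * VM_BIND mapping already claims this address range.
		 */
		va = drm_gpuva_find_first(&vm->gpuvm, start, length);
		if (va)
			return -EBUSY;
	}

	return 0;
}

xe_svm_handle_gpu_fault() (or xe_svm_range_create()) could call this before inserting a new svm range and fail the fault if a conflict is found.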
>
> Continuing on this theme: instead of this interval tree, did you
> consider just using drm_gpuvm as the address space manager?
> It probably needs some overhaul so it no longer assumes it is managing
> only BO-backed allocations, but it could work....
> And it has all the split/merge support already there, which you will
> need for adding hints later?
Yes, another good point. I discussed the approach of leveraging drm_gpuvm with Matt Brost. The good thing is that we can leverage all the range split/merge utilities there.
The difficulty with using drm_gpuvm is that, today, xe_vm/drm_gpuvm are per-device (see the *dev pointer in each structure), while xe_svm has to work across all gpu devices.... So it is hard for xe_svm to inherit from drm_gpuvm...
One approach Matt mentioned is to change drm_gpuvm a little so it works across gpu devices. I think this should be doable. I looked at the dev pointer in drm_gpuvm, and it is not really used much: it is mainly there to print some warning messages, with no real logic depending on it...
So what we can do is remove the dev pointer from drm_gpuvm, and instead of having xe_vm inherit from drm_gpuvm, keep a drm_gpuvm pointer in xe_vm and let xe_svm inherit from drm_gpuvm. Matt pointed all those ideas out to me. Our thinking was to make svm work without changing the xekmd base driver and drm as a first step, and to try this idea as a second step...
But since you also have this idea, I will send an email to the drm_gpuvm designer to ask about the feasibility. If it turns out to be feasible, I will make it work in one step. Considering this would save some code in the memory hints part, I think it is worth considering right now.
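To make the layering we are talking about concrete, below is a purely hypothetical sketch of how the structures could end up looking once the dev pointer is gone; the field names are made up for illustration, not a concrete proposal.

/* drm_gpuvm becomes device-agnostic and lives in the process-wide xe_svm */
struct xe_svm {
	struct drm_gpuvm gpuvm;		/* process-wide VA manager (split/merge, hints) */
	struct mm_struct *mm;		/* the CPU mm this svm mirrors */
	/* mmu notifiers, mutex, interval state, ... */
};

/* each per-device vm just references the shared manager */
struct xe_vm {
	struct xe_device *xe;		/* the device this vm belongs to */
	struct drm_gpuvm *gpuvm;	/* points to the owning xe_svm's gpuvm */
	/* per-device page table state, ... */
};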
Thanks,
Oak
>
> Wanted to hear your thoughts.
>
> -Brian
>
>
>
> > +
> > +/**
> > + * xe_svm_range_create() - create and initialize a svm range
> > + *
> > + * @svm: the svm that the range belongs to
> > + * @vma: the corresponding vma of the range
> > + *
> > + * Create the range and add it to svm's interval tree. Register an mmu
> > + * interval notifier for this range.
> > + *
> > + * Return the pointer to the created svm range,
> > + * or NULL on failure.
> > + */
> > +struct xe_svm_range *xe_svm_range_create(struct xe_svm *svm,
> > + struct vm_area_struct *vma)
> > +{
> > + struct xe_svm_range *range = kzalloc(sizeof(*range), GFP_KERNEL);
> > +
> > + if (!range)
> > + return NULL;
> > +
> > + range->start = vma->vm_start;
> > + range->end = vma->vm_end;
> > + range->vma = vma;
> > + range->svm = svm;
> > +
> > + if (xe_svm_range_register_mmu_notifier(range)) {
> > + kfree(range);
> > + return NULL;
> > + }
> > +
> > + add_range_to_svm(range);
> > + return range;
> > +}