[PATCH V3] drm/xe/mmap: Add mmap support for PCI memory barrier

Upadhyay, Tejas tejas.upadhyay at intel.com
Wed Oct 23 11:00:03 UTC 2024



> -----Original Message-----
> From: Auld, Matthew <matthew.auld at intel.com>
> Sent: Wednesday, October 23, 2024 3:41 PM
> To: Upadhyay, Tejas <tejas.upadhyay at intel.com>; intel-
> xe at lists.freedesktop.org
> Cc: Mrozek, Michal <michal.mrozek at intel.com>
> Subject: Re: [PATCH V3] drm/xe/mmap: Add mmap support for PCI memory
> barrier
> 
> On 22/10/2024 09:36, Upadhyay, Tejas wrote:
> >
> >
> >> -----Original Message-----
> >> From: Auld, Matthew <matthew.auld at intel.com>
> >> Sent: Monday, October 21, 2024 9:03 PM
> >> To: Upadhyay, Tejas <tejas.upadhyay at intel.com>; intel-
> >> xe at lists.freedesktop.org
> >> Cc: Mrozek, Michal <michal.mrozek at intel.com>
> >> Subject: Re: [PATCH V3] drm/xe/mmap: Add mmap support for PCI memory
> >> barrier
> >>
> >> On 21/10/2024 07:11, Tejas Upadhyay wrote:
> >>> To avoid having userspace use MI_MEM_FENCE, we are adding a
> >>> mechanism for userspace to generate a PCI memory barrier with low
> >>> overhead (an IOCTL call, as well as a write to VRAM, would add some
> >>> overhead).
> >>>
> >>> This is implemented by memory-mapping a page as uncached that is
> >>> backed by MMIO on the dGPU, thus allowing userspace to write to the
> >>> page without invoking an IOCTL.
> >>> We select MMIO that is not accessible from the PCI bus, so the MMIO
> >>> writes themselves are ignored, but the PCI memory barrier still takes
> >>> effect, as the MMIO filtering happens after the memory barrier.
> >>>
> >>> When we detect the specially defined offset in mmap(), we map the 4K
> >>> page containing the last page of the doorbell MMIO range to
> >>> userspace for this purpose.
> >>>
> >>> For the user to query the special offset, we add a special flag to
> >>> the mmap_offset ioctl, which needs to be passed as follows:
> >>>    struct drm_xe_gem_mmap_offset mmo = {
> >>>                   .handle = 0, /* this must be 0 */
> >>>                   .flags = DRM_XE_MMAP_OFFSET_FLAG_PCI_BARRIER,
> >>>           };
> >>>    igt_ioctl(fd, DRM_IOCTL_XE_GEM_MMAP_OFFSET, &mmo);
> >>>    map = mmap(NULL, size, PROT_WRITE, MAP_SHARED, fd, mmo.offset);
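> >>>
> >>> Any subsequent write to the mapped page then acts as the barrier;
> >>> the written value itself is discarded. Purely as an illustration
> >>> (not part of this patch):
> >>>    /* hypothetical: any dword store to the page issues the barrier */
> >>>    ((volatile uint32_t *)map)[0] = 0;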
> >>
> >> We should probably take some of this and copy it into the kernel-doc
> >> in the uapi header? Future user is probably first looking at the uapi
> >> header, which means they miss out on the above or have to go hunting for
> >> more info in commit messages.
> >
> > Ah, correct. I will make sure it's in the kernel-doc in the uapi header.
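> >
> > Roughly along these lines, as a sketch (exact wording to be refined):
> >
> >    #define DRM_XE_MMAP_OFFSET_FLAG_PCI_BARRIER     (1 << 0)
> >    /**
> >     * @flags: When DRM_XE_MMAP_OFFSET_FLAG_PCI_BARRIER is set, @handle
> >     * must be 0 and @offset is filled with a fake offset which, when
> >     * passed to mmap(), maps a write-only page whose writes act as a
> >     * PCI memory barrier while the written data is discarded. Must be
> >     * zero otherwise.
> >     */
> >    __u32 flags;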
> >
> >>
> >>>
> >>> Note: Test coverage for this is added by the IGT test here:
> >>>         https://patchwork.freedesktop.org/patch/618931/
> >>>
> >>> V3(MAuld)
> >>>     - Remove offset definition from UAPI to be able to change it later
> >>>     - Edit commit message for special flag addition
> >>> V2(MAuld)
> >>>     - Add fault handler with dummy page to handle unplug device
> >>>     - Add Build check for special offset to be below normal start page
> >>>     - Test d3hot, mapping seems to be valid in d3hot as well
> >>>     - Add more info to commit message
> >>>
> >>> Cc: Matthew Auld <matthew.auld at intel.com>
> >>> Cc: Michal Mrozek <michal.mrozek at intel.com>
> >>> Signed-off-by: Tejas Upadhyay <tejas.upadhyay at intel.com>
> >>> ---
> >>>    drivers/gpu/drm/xe/xe_bo.c     |  10 ++-
> >>>    drivers/gpu/drm/xe/xe_bo.h     |   2 +
> >>>    drivers/gpu/drm/xe/xe_device.c | 109 ++++++++++++++++++++++++++++++++-
> >>>    include/uapi/drm/xe_drm.h      |   3 +-
> >>>    4 files changed, 121 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> >>> index d5d30a0ff1e7..078f92eb0947 100644
> >>> --- a/drivers/gpu/drm/xe/xe_bo.c
> >>> +++ b/drivers/gpu/drm/xe/xe_bo.c
> >>> @@ -2132,9 +2132,17 @@ int xe_gem_mmap_offset_ioctl(struct drm_device *dev, void *data,
> >>>    	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> >>>    		return -EINVAL;
> >>>
> >>> -	if (XE_IOCTL_DBG(xe, args->flags))
> >>> +	if (XE_IOCTL_DBG(xe, args->flags &
> >>> +			 ~DRM_XE_MMAP_OFFSET_FLAG_PCI_BARRIER))
> >>>    		return -EINVAL;
> >>>
> >>> +	if (args->flags & DRM_XE_MMAP_OFFSET_FLAG_PCI_BARRIER) {
> >>> +		if (XE_IOCTL_DBG(xe, args->handle))
> >>> +			return -EINVAL;
> >>> +		args->offset = XE_PCI_BARRIER_MMAP_OFFSET;
> >>> +		return 0;
> >>> +	}
> >>> +
> >>>    	gem_obj = drm_gem_object_lookup(file, args->handle);
> >>>    	if (XE_IOCTL_DBG(xe, !gem_obj))
> >>>    		return -ENOENT;
> >>> diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h
> >>> index 41624159a291..b36d1fba54c1 100644
> >>> --- a/drivers/gpu/drm/xe/xe_bo.h
> >>> +++ b/drivers/gpu/drm/xe/xe_bo.h
> >>> @@ -63,6 +63,8 @@
> >>>
> >>>    #define XE_BO_PROPS_INVALID	(-1)
> >>>
> >>> +#define XE_PCI_BARRIER_MMAP_OFFSET	(0x50 << XE_PTE_SHIFT)
> >>> +
> >>>    struct sg_table;
> >>>
> >>>    struct xe_bo *xe_bo_alloc(void);
> >>> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> >>> index 2da4affe4dfd..8a403f6a068d 100644
> >>> --- a/drivers/gpu/drm/xe/xe_device.c
> >>> +++ b/drivers/gpu/drm/xe/xe_device.c
> >>> @@ -239,12 +239,119 @@ static long xe_drm_compat_ioctl(struct file *file, unsigned int cmd, unsigned lo
> >>>    #define xe_drm_compat_ioctl NULL
> >>>    #endif
> >>>
> >>> +static void barrier_open(struct vm_area_struct *vma)
> >>> +{
> >>> +	drm_dev_get(vma->vm_private_data);
> >>> +}
> >>> +
> >>> +static void barrier_close(struct vm_area_struct *vma)
> >>> +{
> >>> +	drm_dev_put(vma->vm_private_data);
> >>> +}
> >>> +
> >>> +static void barrier_release_dummy_page(struct drm_device *dev, void *res)
> >>> +{
> >>> +	struct page *dummy_page = (struct page *)res;
> >>> +
> >>> +	__free_page(dummy_page);
> >>> +}
> >>> +
> >>> +static vm_fault_t barrier_fault(struct vm_fault *vmf)
> >>> +{
> >>> +	struct drm_device *dev = vmf->vma->vm_private_data;
> >>> +	struct vm_area_struct *vma = vmf->vma;
> >>> +	vm_fault_t ret = VM_FAULT_NOPAGE;
> >>> +	unsigned long address;
> >>> +	unsigned long pfn;
> >>> +	struct page *page;
> >>> +	pgprot_t prot;
> >>> +
> >>> +	prot = vm_get_page_prot(vma->vm_flags);
> >>> +
> >>> +	/* Allocate new dummy page to map all the VA range in this VMA to it */
> >>> +	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> >>> +	if (!page)
> >>> +		return VM_FAULT_OOM;
> >>> +
> >>> +	/* Set the page to be freed using drmm release action */
> >>> +	if (drmm_add_action_or_reset(dev, barrier_release_dummy_page, page))
> >>> +		return VM_FAULT_OOM;
> >>> +
> >>> +	pfn = page_to_pfn(page);
> >>> +
> >>> +	/* Prefault the entire VMA range right away to avoid further faults */
> >>> +	for (address = vma->vm_start; address < vma->vm_end;
> >>> +	     address += XE_PAGE_SIZE)
> >>> +		ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> >>> +
> >>> +	return ret;
> >>> +}
> >>> +
> >>> +static const struct vm_operations_struct vm_ops_barrier = {
> >>> +	.open = barrier_open,
> >>> +	.close = barrier_close,
> >>> +	.fault = barrier_fault,
> >>> +};
> >>> +
> >>> +static int xe_pci_barrier_mmap(struct file *filp,
> >>> +			       struct vm_area_struct *vma)
> >>> +{
> >>> +	struct drm_file *priv = filp->private_data;
> >>> +	struct drm_device *dev = priv->minor->dev;
> >>> +	unsigned long pfn;
> >>> +	pgprot_t prot;
> >>> +
> >>> +	if (vma->vm_end - vma->vm_start > XE_PAGE_SIZE)
> >>
> >> One doubt here: what happens when PAGE_SIZE > XE_PAGE_SIZE? Does this
> >> all still work on such a machine? I assume the vma is always a
> >> multiple of PAGE_SIZE.
> >
> > This is indeed a valid doubt. With that in mind, I think we should not
> > even use PAGE_SIZE; we should use SZ_4K, as we don't have the required
> > protection beyond a 4K page, and the system PAGE_SIZE can vary, which
> > could put us in an unsafe region.
> >
> > I will replace it with SZ_4K everywhere and resend the next version.
> 
> I think XE_PAGE_SIZE is already hardcoded to 4K, since this is more on the
> GPU side. My point was more that on some machines the CPU PAGE_SIZE could
> be, say, 64K, and it seems like with this uapi it will not be
> usable/portable, since the vma size here would always be a multiple of
> PAGE_SIZE and the mmap here would then always reject it? It looks like
> this is targeted at dGPU, so the host CPU could be anything really.

The CPU PAGE_SIZE has to be 4K to support the feature; otherwise the feature is not supported on such platforms. It is a HW limit.
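
If we want to make that explicit rather than rely on the size check alone,
a guard along these lines could go at the start of xe_pci_barrier_mmap()
(untested sketch, not in this patch):

	/* hypothetical guard: the barrier page is only 4K (HW limit), so
	 * refuse the mapping on kernels built with a larger CPU page size
	 */
	if (PAGE_SIZE != SZ_4K)
		return -EINVAL;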

Tejas 
> 
> >
> > Tejas
> >
> >>
> >>> +		return -EINVAL;
> >>> +
> >>> +	if (is_cow_mapping(vma->vm_flags))
> >>> +		return -EINVAL;
> >>> +
> >>> +	if (vma->vm_flags & (VM_READ | VM_EXEC))
> >>> +		return -EINVAL;
> >>> +
> >>> +	vm_flags_clear(vma, VM_MAYREAD | VM_MAYEXEC);
> >>> +	vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO);
> >>> +
> >>> +	prot = vm_get_page_prot(vma->vm_flags);
> >>> +#define LAST_DB_PAGE_OFFSET 0x7ff001
> >>> +	pfn = PHYS_PFN(pci_resource_start(to_pci_dev(dev->dev), 0) +
> >>> +			LAST_DB_PAGE_OFFSET);
> >>> +	if (vmf_insert_pfn_prot(vma, vma->vm_start, pfn,
> >>> +				pgprot_noncached(prot)) != VM_FAULT_NOPAGE)
> >>> +		return -EFAULT;
> >>> +
> >>> +	vma->vm_ops = &vm_ops_barrier;
> >>> +	vma->vm_private_data = dev;
> >>> +	drm_dev_get(vma->vm_private_data);
> >>> +	return 0;
> >>> +}
> >>> +
> >>> +static int xe_mmap(struct file *filp, struct vm_area_struct *vma)
> >>> +{
> >>> +	struct drm_file *priv = filp->private_data;
> >>> +	struct drm_device *dev = priv->minor->dev;
> >>> +
> >>> +	if (drm_dev_is_unplugged(dev))
> >>> +		return -ENODEV;
> >>> +
> >>> +	BUILD_BUG_ON(((XE_PCI_BARRIER_MMAP_OFFSET >> XE_PTE_SHIFT) +
> >>> +		      XE_PAGE_SIZE) >= DRM_FILE_PAGE_OFFSET_START);
> >>> +
> >>> +	switch (vma->vm_pgoff) {
> >>> +	case XE_PCI_BARRIER_MMAP_OFFSET >> XE_PTE_SHIFT:
> >>> +		return xe_pci_barrier_mmap(filp, vma);
> >>> +	}
> >>> +
> >>> +	return drm_gem_mmap(filp, vma);
> >>> +}
> >>> +
> >>>    static const struct file_operations xe_driver_fops = {
> >>>    	.owner = THIS_MODULE,
> >>>    	.open = drm_open,
> >>>    	.release = drm_release_noglobal,
> >>>    	.unlocked_ioctl = xe_drm_ioctl,
> >>> -	.mmap = drm_gem_mmap,
> >>> +	.mmap = xe_mmap,
> >>>    	.poll = drm_poll,
> >>>    	.read = drm_read,
> >>>    	.compat_ioctl = xe_drm_compat_ioctl,
> >>> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> >>> index c4182e95a619..6b1b08a9f528 100644
> >>> --- a/include/uapi/drm/xe_drm.h
> >>> +++ b/include/uapi/drm/xe_drm.h
> >>> @@ -819,7 +819,8 @@ struct drm_xe_gem_mmap_offset {
> >>>    	/** @handle: Handle for the object being mapped. */
> >>>    	__u32 handle;
> >>>
> >>> -	/** @flags: Must be zero */
> >>> +#define DRM_XE_MMAP_OFFSET_FLAG_PCI_BARRIER     (1 << 0)
> >>> +	/** @flags: Flag to indicate if any special offset, zero otherwise */
> >>>    	__u32 flags;
> >>>
> >>>    	/** @offset: The fake offset to use for subsequent mmap call */

