[PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region

Xiong, Jianxin jianxin.xiong at intel.com
Tue Oct 27 20:33:52 UTC 2020


> -----Original Message-----
> From: Jason Gunthorpe <jgg at ziepe.ca>
> Sent: Tuesday, October 27, 2020 1:08 PM
> To: Xiong, Jianxin <jianxin.xiong at intel.com>
> Cc: linux-rdma at vger.kernel.org; dri-devel at lists.freedesktop.org; Doug Ledford <dledford at redhat.com>; Leon Romanovsky
> <leon at kernel.org>; Sumit Semwal <sumit.semwal at linaro.org>; Christian Koenig <christian.koenig at amd.com>; Vetter, Daniel
> <daniel.vetter at intel.com>
> Subject: Re: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region
> 
> On Fri, Oct 23, 2020 at 09:40:01AM -0700, Jianxin Xiong wrote:
> 
> > diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
> > index b261797..3bc412b 100644
> > +++ b/drivers/infiniband/hw/mlx5/mr.c
> > @@ -1,5 +1,6 @@
> >  /*
> >   * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
> > + * Copyright (c) 2020, Intel Corporation. All rights reserved.
> >   *
> >   * This software is available to you under a choice of one of two
> >   * licenses.  You may choose to be licensed under the terms of the GNU
> > @@ -36,6 +37,8 @@
> >  #include <linux/debugfs.h>
> >  #include <linux/export.h>
> >  #include <linux/delay.h>
> > +#include <linux/dma-buf.h>
> > +#include <linux/dma-resv.h>
> >  #include <rdma/ib_umem.h>
> >  #include <rdma/ib_umem_odp.h>
> >  #include <rdma/ib_verbs.h>
> > @@ -1113,6 +1116,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
> >  		dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
> >  		if (mr->umem->is_odp) {
> >  			mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
> > +		} else if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) {
> > +			memset(xlt, 0, size);
> >  		} else {
> >  			__mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
> >  					       npages, xlt,
> > @@ -1462,6 +1467,111 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
> >  	return ERR_PTR(err);
> >  }
> >
> > +static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
> > +{
> > +	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
> > +	struct mlx5_ib_mr *mr = umem_dmabuf->device_context;
> > +
> > +	mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP);
> > +	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
> > +}
> > +
> > +static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
> > +	.allow_peer2peer = 1,
> > +	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
> > +};
> > +
> > +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
> > +					 u64 length, u64 virt_addr,
> > +					 int fd, int access_flags,
> > +					 struct ib_udata *udata)
> > +{
> > +	struct mlx5_ib_dev *dev = to_mdev(pd->device);
> > +	struct mlx5_ib_mr *mr = NULL;
> > +	struct ib_umem *umem;
> > +	struct ib_umem_dmabuf *umem_dmabuf;
> > +	int npages;
> > +	int order;
> > +	int err;
> > +
> > +	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
> > +		return ERR_PTR(-EOPNOTSUPP);
> > +
> > +	mlx5_ib_dbg(dev,
> > +		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
> > +		    offset, virt_addr, length, fd, access_flags);
> > +
> > +	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, access_flags,
> > +				  &mlx5_ib_dmabuf_attach_ops);
> > +	if (IS_ERR(umem)) {
> > +		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
> > +		return ERR_PTR(PTR_ERR(umem));
> > +	}
> > +
> > +	npages = ib_umem_num_pages(umem);
> > +	if (!npages) {
> 
> ib_umem_get should reject invalid umems like this

Will move the check to ib_umem_dmabuf_get().
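Roughly like this, I think -- a sketch only, with the parameter names taken
from patch 1/4 of this series (they may shift after rebasing):

	unsigned long end;

	/* Reject degenerate regions at umem creation time so drivers
	 * never see a dma-buf umem that maps zero pages.
	 */
	if (check_add_overflow(offset, (unsigned long)size, &end))
		return ERR_PTR(-EINVAL);
	if (unlikely(!size || (offset | size) & ~PAGE_MASK))
		return ERR_PTR(-EINVAL);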

> 
> > +		mlx5_ib_warn(dev, "avoid zero region\n");
> > +		ib_umem_release(umem);
> > +		return ERR_PTR(-EINVAL);
> > +	}
> > +
> > +	order = ilog2(roundup_pow_of_two(npages));
> 
> Must always call ib_umem_find_best_pgsz(), specify PAGE_SIZE as the argument for this scenario

Will fix.
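Concretely, something like this (sketch; assumes the rebased code still has
virt_addr available at this point):

	unsigned long page_size;

	/* Only PAGE_SIZE mappings are supported for dma-buf for now;
	 * a zero-length or badly aligned umem is rejected here as well.
	 */
	page_size = ib_umem_find_best_pgsz(umem, PAGE_SIZE, virt_addr);
	if (!page_size) {
		ib_umem_release(umem);
		return ERR_PTR(-EINVAL);
	}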

> 
> > +	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
> > +		    npages, npages, order, PAGE_SHIFT);
> > +
> > +	mr = alloc_mr_from_cache(pd, umem, virt_addr, length, npages,
> > +				 PAGE_SHIFT, order, access_flags);
> > +	if (IS_ERR(mr))
> > +		mr = NULL;
> > +
> > +	if (!mr) {
> > +		mutex_lock(&dev->slow_path_mutex);
> > +		mr = reg_create(NULL, pd, virt_addr, length, umem, npages,
> > +				PAGE_SHIFT, access_flags, false);
> > +		mutex_unlock(&dev->slow_path_mutex);
> > +	}
> 
> Rebase on the mlx5 series just posted and use their version of this code sequence, this is just not quite right

Will do.

> 
> 
> > +	err = mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT,
> > +				 MLX5_IB_UPD_XLT_ENABLE | MLX5_IB_UPD_XLT_ZAP);
> > +
> > +	if (err) {
> > +		dereg_mr(dev, mr);
> > +		return ERR_PTR(err);
> > +	}
> 
> The current mlx5 code preloads the buffer with the right data, zapping is fairly expensive, mapping and loading is the same cost

Could probably do the same here. Will double check.
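If that works out, the ENABLE | ZAP step above would turn into roughly the
following (a sketch only -- it assumes ib_umem_dmabuf_map_pages() from patch
1/4 can be called here under the reservation lock, that umem_dmabuf has been
set via to_ib_umem_dmabuf(umem), and that __mlx5_ib_populate_pas() is happy
with the sg list of a mapped dma-buf umem):

	/* Preload: map the dma-buf and write real PAs instead of
	 * zapping, so the first access does not take a page fault.
	 */
	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
	if (!err)
		err = mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT,
					 MLX5_IB_UPD_XLT_ENABLE);
	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
	if (err) {
		dereg_mr(dev, mr);
		return ERR_PTR(err);
	}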

> 
> > @@ -1536,7 +1646,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
> >  	if (!mr->umem)
> >  		return -EINVAL;
> >
> > -	if (is_odp_mr(mr))
> > +	if (is_odp_mr(mr) || is_dmabuf_mr(mr))
> >  		return -EOPNOTSUPP;
> >
> >  	if (flags & IB_MR_REREG_TRANS) {
> > @@ -1695,7 +1805,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
> >  	struct ib_umem *umem = mr->umem;
> >
> >  	/* Stop all DMA */
> > -	if (is_odp_mr(mr))
> > +	if (is_odp_mr(mr) || is_dmabuf_mr(mr))
> >  		mlx5_ib_fence_odp_mr(mr);
> 
> Make a dma buf specific function

Ok.
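Tentatively something along these lines, mirroring what
mlx5_ib_fence_odp_mr() does (a sketch only -- the deferred-work draining is
omitted, and the mlx5_mr_cache_invalidate() call needs to be checked against
your pending mlx5 MR series):

static void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);

	/* Stop new page faults / prefetches from finding this mkey and
	 * wait for the ones already running.
	 */
	xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
	synchronize_srcu(&mr->dev->odp_srcu);

	/* Revoke the mkey and drop the dma-buf mapping. */
	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
	mlx5_mr_cache_invalidate(mr);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
}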

> 
> I have another series touching this area too, but I think
> 
> > @@ -801,6 +816,52 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
> >   * Returns:
> >   *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
> >   *           not accessible, or the MR is no longer valid.
> > + *  -EAGAIN: The operation should be retried
> > + *
> > + *  >0: Number of pages mapped
> > + */
> > +static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, struct ib_umem *umem,
> > +			       u64 io_virt, size_t bcnt, u32 *bytes_mapped,
> > +			       u32 flags)
> > +{
> > +	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem);
> > +	u64 user_va;
> > +	u64 end;
> > +	int npages;
> > +	int err;
> > +
> > +	if (unlikely(io_virt < mr->mmkey.iova))
> > +		return -EFAULT;
> > +	if (check_add_overflow(io_virt - mr->mmkey.iova,
> > +			       (u64)umem->address, &user_va))
> > +		return -EFAULT;
> > +	/* Overflow has already been checked at the umem creation time */
> > +	end = umem->address + umem->length;
> > +	if (unlikely(user_va >= end || end  - user_va < bcnt))
> > +		return -EFAULT;
> 
> Why duplicate this sequence? Caller does it

The sequence in the caller is for umem_odp only.
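For reference, the rest of the caller looks roughly like this (quoting the
current odp.c from memory, so treat it as approximate) and works on the
ib_umem_odp, which is why the dma-buf path carries its own bounds check
against the plain ib_umem:

	if (!odp->is_implicit_odp) {
		u64 user_va;

		if (check_add_overflow(io_virt - mr->mmkey.iova,
				       (u64)odp->umem.address, &user_va))
			return -EFAULT;
		if (unlikely(user_va >= ib_umem_end(odp) ||
			     ib_umem_end(odp) - user_va < bcnt))
			return -EFAULT;
		return pagefault_real_mr(mr, odp, user_va, bcnt,
					 bytes_mapped, flags);
	}
	return pagefault_implicit_mr(mr, odp, io_virt, bcnt,
				     bytes_mapped, flags);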

> 
> > @@ -811,6 +872,10 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
> >  {
> >  	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
> >
> > +	if (is_dmabuf_mr(mr))
> > +		return pagefault_dmabuf_mr(mr, mr->umem, io_virt, bcnt,
> > +					   bytes_mapped, flags);
> > +
> >  	lockdep_assert_held(&mr->dev->odp_srcu);
> >  	if (unlikely(io_virt < mr->mmkey.iova))
> >  		return -EFAULT;
> > @@ -1747,7 +1812,6 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
> >  {
> >  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
> >  	struct mlx5_core_mkey *mmkey;
> > -	struct ib_umem_odp *odp;
> >  	struct mlx5_ib_mr *mr;
> >
> >  	lockdep_assert_held(&dev->odp_srcu);
> > @@ -1761,11 +1825,9 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
> >  	if (mr->ibmr.pd != pd)
> >  		return NULL;
> >
> > -	odp = to_ib_umem_odp(mr->umem);
> > -
> >  	/* prefetch with write-access must be supported by the MR */
> >  	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> > -	    !odp->umem.writable)
> > +	    !mr->umem->writable)
> 
> ??

There is no need to use umem_odp here; mr->umem is the same pointer as &odp->umem.
This change makes the code work for both umem_odp and umem_dmabuf.
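(Since struct ib_umem_odp embeds the ib_umem, to_ib_umem_odp() is just a
container_of() and the two pointers name the same object:)

/* include/rdma/ib_umem_odp.h */
static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
{
	return container_of(umem, struct ib_umem_odp, umem);
}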

> 
> This does look basically right though. I think a little more polishing and it can be merged. It does need to go after the mlx5 MR series
> though..
> 
> Jason
