[PATCH v1 4/5] drm/xe/bo: Create a new sg for dmabuf BOs that are associated with a VF
Matthew Brost
matthew.brost at intel.com
Sun Oct 20 18:06:12 UTC 2024
On Tue, Oct 15, 2024 at 11:41:56PM -0600, Kasireddy, Vivek wrote:
> Hi Matt,
>
> >
> > On Fri, Oct 11, 2024 at 07:40:26PM -0700, Vivek Kasireddy wrote:
> > > For BOs of type ttm_bo_type_sg, that are backed by PCI BAR addresses
> > > associated with a VF, we need to adjust and translate these addresses
> > > to LMEM addresses to make the BOs usable by the PF. Otherwise, the BOs
> > > (i.e., PCI BAR addresses) are only accessible by the CPU and not by
> > > the GPU.
> > >
> > > In order to do the above, we first need to identify if the DMA addresses
> > > associated with an imported BO (type ttm_bo_type_sg) belong to System
> > > RAM or a VF or other PCI device. After we confirm that they belong to
> > > a VF, we convert the DMA addresses (IOVAs in this case) to DPAs and
> > > create a new sg and populate it with the new addresses.
> >
> > I think using an SG list is a no-go. We have received pushback before [1]
> > about using an SG list as a structure to hold DPAs rather than DMA
> > addresses. The consensus was that an SG list is not a generic structure to
> > hold any address [2], but rather a structure specifically for DMA addresses.
> AFAIU, that would make sense if the SG list were exposed outside of the Xe
> driver, but the SG list created by this patch is only used internally by
> the Xe driver. Would this still not be acceptable?
>
I'd check with Thomas on this. I'm not hugely opposed, but if I remember
correctly, Thomas agrees with Jason in the links below that SG lists
shouldn't be used like this.
Certainly, making this internal to Xe makes it safer, though.
> >
> > I'm pretty sure we will have to define a new BO type (ttm_bo_type_devmem?)
> > and a structure that can be iterated on if we want to do something like
> > this, unless we want to ignore the above feedback.
> Right, if using an SG list is a no-go, then using some form of xe_res_cursor
> to iterate seems like the only other option.
>
Ah, yes. Since this is internal to Xe, it may be easy enough to use a
different cursor type here rather than touching TTM.
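Just to illustrate what I mean (all names here are hypothetical, nothing is
set in stone), an Xe-internal cursor for this could look roughly like:

	/*
	 * Hypothetical Xe-internal cursor for walking an imported SG list
	 * one page at a time and resolving each page to a VF DPA.
	 */
	struct xe_dpa_cursor {
		struct scatterlist *sgl;	/* current segment of the input SG */
		u64 seg_offset;			/* byte offset into the current segment */
		u64 remaining;			/* bytes left in the whole walk */
		u64 dpa;			/* DPA of the current page */
	};

Something iterable like that would also keep the DPA handling out of the SG
list entirely.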
> >
> > [1] https://patchwork.freedesktop.org/patch/574894/?series=128910&rev=1
> > [2] https://patchwork.freedesktop.org/patch/574894/?series=128910&rev=1#comment_1070889
> >
> > >
> > > Signed-off-by: Vivek Kasireddy <vivek.kasireddy at intel.com>
> > > ---
> > > drivers/gpu/drm/xe/xe_bo.c | 108 ++++++++++++++++++++++++++++++-
> > > drivers/gpu/drm/xe/xe_bo_types.h | 6 ++
> > > 2 files changed, 113 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> > > index c74c121ea7bb..64efe1b21f19 100644
> > > --- a/drivers/gpu/drm/xe/xe_bo.c
> > > +++ b/drivers/gpu/drm/xe/xe_bo.c
> > > @@ -6,6 +6,7 @@
> > > #include "xe_bo.h"
> > >
> > > #include <linux/dma-buf.h>
> > > +#include <linux/iommu.h>
> > >
> > > #include <drm/drm_drv.h>
> > > #include <drm/drm_gem_ttm_helper.h>
> > > @@ -15,16 +16,19 @@
> > > #include <drm/ttm/ttm_tt.h>
> > > #include <uapi/drm/xe_drm.h>
> > >
> > > +#include "regs/xe_bars.h"
> > > #include "xe_device.h"
> > > #include "xe_dma_buf.h"
> > > #include "xe_drm_client.h"
> > > #include "xe_ggtt.h"
> > > #include "xe_gt.h"
> > > +#include "xe_gt_sriov_pf_config.h"
> > > #include "xe_map.h"
> > > #include "xe_migrate.h"
> > > #include "xe_pm.h"
> > > #include "xe_preempt_fence.h"
> > > #include "xe_res_cursor.h"
> > > +#include "xe_sriov_pf_helpers.h"
> > > #include "xe_trace_bo.h"
> > > #include "xe_ttm_stolen_mgr.h"
> > > #include "xe_vm.h"
> > > @@ -543,6 +547,102 @@ static int xe_bo_trigger_rebind(struct xe_device *xe, struct xe_bo *bo,
> > > return ret;
> > > }
> > >
> > > +static struct pci_dev *xe_find_vf_dev(struct xe_device *xe,
> > > + phys_addr_t phys)
> > > +{
> > > + struct pci_dev *pdev, *pf_pdev = to_pci_dev(xe->drm.dev);
> > > + resource_size_t io_start, io_size;
> > > +
> > > + list_for_each_entry(pdev, &pf_pdev->bus->devices, bus_list) {
> > > + if (pdev->is_physfn)
> > > + continue;
> > > +
> > > + io_start = pci_resource_start(pdev, LMEM_BAR);
> > > + io_size = pci_resource_len(pdev, LMEM_BAR);
> > > +
> > > + if (phys >= io_start &&
> > > + phys < (io_start + io_size - PAGE_SIZE))
> > > + return pdev;
> > > + }
> > > + return NULL;
> > > +}
> > > +
> > > +
> > > +static void xe_bo_translate_iova_to_dpa(struct xe_device *xe,
> > > + struct sg_table *sg,
> > > + struct sg_table *new_sg,
> > > + struct pci_dev *pdev)
> > > +{
> > > + resource_size_t io_start = pci_resource_start(pdev, LMEM_BAR);
> > > + struct xe_gt *gt = xe_root_mmio_gt(xe);
> > > + struct scatterlist *sgl, *new_sgl;
> > > + int i, vfid = pci_iov_vf_id(pdev);
> > > + dma_addr_t new_addr, bo_addr;
> > > + struct iommu_domain *domain;
> > > + phys_addr_t phys;
> > > + u64 offset;
> > > +
> > > + bo_addr = xe_gt_sriov_pf_config_get_lmem_addr(gt, ++vfid);
> > > + domain = iommu_get_domain_for_dev(xe->drm.dev);
> > > +
> > > + new_sgl = new_sg->sgl;
> > > + for_each_sgtable_dma_sg(sg, sgl, i) {
> >
> > I'm pretty sure this doesn't work if a single dma address of the input
> > 'sg' resolves to a non-contiguous physical DPA. In most cases this is
> > going to be contiguous though unless there is memory pressure or
> > unaligned allocations sizes. Assuming your testing didn't blow up, you
> > may not have hit a memory pressure situation where VRAM gets
> > fragmented.
> Yeah, I did not test exhaustively and also did not realize that the BO
> (config->lmem_obj) backing the VF's lmem quota may not be contiguous.
> I'll try to figure out a way to test this scenario. However, for each SG entry,
> if I do something like:
>
> offset = phys - io_start;
> new_addr = xe_bo_addr(config->lmem_obj, offset, sg_dma_len(sgl));
>
> instead of
>
> bo_addr = xe_bo_addr(config->lmem_obj, 0, PAGE_SIZE);
> offset = phys - io_start;
> new_addr = bo_addr + offset;
>
> I believe this would probably work as expected even when lmem_obj is not
> contiguous (as it probes the buddy blocks) or if the input SG has
> non-contiguous physical (DPA) ranges.
>
> >
> > I think iommu_iova_to_phys is only accurate for exactly one page, unless
> > I'm missing something. See [3].
> IIUC, it seems to be valid for the segment of size sg_dma_len(sgl), for a given
> SG entry.
>
So the SG list you are remapping here is set up in xe_dma_buf_map,
specifically for the VRAM case of xe_ttm_vram_mgr_alloc_sgt, right?
Yes, it appears each segment (sg_dma_len(sgl)) is a buddy block, so it
is contiguous. This does work, but it makes a lot of assumptions about
the lower layers, which is not ideal.
Each segment is still a DMA address, so it’s possible that
iommu_iova_to_phys is only accurate for exactly one page within the
segment. I'd prefer to code for that behavior, which will always work,
rather than making assumptions about a lower layer.
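To make it concrete, something along these lines is what I have in mind --
an untested sketch that only relies on iommu_iova_to_phys() being valid for
a single page ('domain', 'io_start', 'config', 'new_addr', etc. are borrowed
from the patch and your snippet above):

	/*
	 * Untested sketch: resolve every page of each segment individually
	 * instead of assuming a segment is physically contiguous.
	 */
	for_each_sgtable_dma_sg(sg, sgl, i) {
		u64 j;

		for (j = 0; j < sg_dma_len(sgl); j += PAGE_SIZE) {
			dma_addr_t iova = sg_dma_address(sgl) + j;
			phys_addr_t phys = domain ?
				iommu_iova_to_phys(domain, iova) : iova;

			/*
			 * xe_bo_addr() probes the buddy blocks, so a
			 * non-contiguous lmem_obj is handled correctly.
			 */
			new_addr = xe_bo_addr(config->lmem_obj,
					      phys - io_start, PAGE_SIZE);
			/* ... record new_addr for this page ... */
		}
	}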
With all of the above, can we add a page-by-page DPA cursor? Thomas has
written one for SVM [4] [5]. That code will take a while to land, but I
think it can give you a template to build a cursor like this. Once that
lands, maybe we can combine these cursors.
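With a cursor like the hypothetical struct I sketched further up, I'd hope
the remapping loop collapses to something like this (again, purely
illustrative names, loosely following the xe_res_first()/xe_res_next()
pattern):

	struct xe_dpa_cursor cur;

	for (xe_dpa_cursor_first(&cur, sg, PAGE_SIZE);
	     cur.remaining;
	     xe_dpa_cursor_next(&cur, PAGE_SIZE)) {
		/* cur.dpa is valid for exactly one page here */
	}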
Matt
[4] https://patchwork.freedesktop.org/patch/619812/?series=137870&rev=2
[5] https://patchwork.freedesktop.org/patch/619845/?series=137870&rev=2
> Thanks,
> Vivek
>
> >
> > [3] https://elixir.bootlin.com/linux/v6.11.3/source/drivers/iommu/iommu.c#L2376
> >
> > Matt
> >
> > > + phys = domain ? iommu_iova_to_phys(domain, sg_dma_address(sgl)) :
> > > + sg_dma_address(sgl);
> > > + offset = phys - io_start;
> > > + new_addr = bo_addr + offset;
> > > +
> > > + sg_set_page(new_sgl, NULL, sg_dma_len(sgl), 0);
> > > + sg_dma_address(new_sgl) = new_addr;
> > > + sg_dma_len(new_sgl) = sg_dma_len(sgl);
> > > +
> > > + new_sgl = sg_next(new_sgl);
> > > + }
> > > +}
> > > +
> > > +static struct sg_table *xe_bo_create_new_sg(struct sg_table *sg,
> > > + struct xe_bo *bo)
> > > +{
> > > + struct xe_device *xe = xe_bo_device(bo);
> > > + struct iommu_domain *domain;
> > > + struct sg_table *new_sg;
> > > + struct pci_dev *pdev;
> > > + phys_addr_t phys;
> > > + int vfid;
> > > +
> > > + if (!IS_SRIOV_PF(xe))
> > > + return sg;
> > > +
> > > + domain = iommu_get_domain_for_dev(xe->drm.dev);
> > > + phys = domain ? iommu_iova_to_phys(domain, sg_dma_address(sg->sgl)) :
> > > + sg_dma_address(sg->sgl);
> > > +
> > > + if (page_is_ram(PFN_DOWN(phys)))
> > > + return sg;
> > > +
> > > + pdev = xe_find_vf_dev(xe, phys);
> > > + if (!pdev)
> > > + return sg;
> > > +
> > > + vfid = pci_iov_vf_id(pdev);
> > > + if (vfid < 0)
> > > + return sg;
> > > +
> > > + new_sg = kzalloc(sizeof(*new_sg), GFP_KERNEL);
> > > + if (!new_sg)
> > > + return sg;
> > > +
> > > + if (sg_alloc_table(new_sg, sg->nents, GFP_KERNEL)) {
> > > + kfree(new_sg);
> > > + return sg;
> > > + }
> > > +
> > > + bo->is_devmem_external = true;
> > > + xe_bo_translate_iova_to_dpa(xe, sg, new_sg, pdev);
> > > +
> > > + return new_sg;
> > > +}
> > > +
> > > /*
> > > * The dma-buf map_attachment() / unmap_attachment() is hooked up here.
> > > * Note that unmapping the attachment is deferred to the next
> > > @@ -577,7 +677,7 @@ static int xe_bo_move_dmabuf(struct ttm_buffer_object *ttm_bo,
> > > return PTR_ERR(sg);
> > >
> > > ttm_bo->sg = sg;
> > > - xe_tt->sg = sg;
> > > + xe_tt->sg = xe_bo_create_new_sg(sg, ttm_to_xe_bo(ttm_bo));
> > >
> > > out:
> > > ttm_bo_move_null(ttm_bo, new_res);
> > > @@ -1066,6 +1166,8 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo)
> > >
> > > static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)
> > > {
> > > + struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
> > > +
> > > if (!xe_bo_is_xe_bo(ttm_bo))
> > > return;
> > >
> > > @@ -1079,6 +1181,10 @@ static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)
> > >
> > > dma_buf_unmap_attachment(ttm_bo->base.import_attach, ttm_bo->sg,
> > > DMA_BIDIRECTIONAL);
> > > + if (bo->is_devmem_external && xe_tt->sg != ttm_bo->sg) {
> > > + sg_free_table(xe_tt->sg);
> > > + kfree(xe_tt->sg);
> > > + }
> > > ttm_bo->sg = NULL;
> > > xe_tt->sg = NULL;
> > > }
> > > diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h
> > > index 8b9201775081..0fe619bc436d 100644
> > > --- a/drivers/gpu/drm/xe/xe_bo_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_bo_types.h
> > > @@ -67,6 +67,12 @@ struct xe_bo {
> > > /** @ccs_cleared */
> > > bool ccs_cleared;
> > >
> > > + /**
> > > + * @is_devmem_external: Whether this BO is an imported dma-buf that
> > > + * has a backing store in VRAM.
> > > + */
> > > + bool is_devmem_external;
> > > +
> > > /**
> > > * @cpu_caching: CPU caching mode. Currently only used for userspace
> > > * objects. Exceptions are system memory on DGFX, which is always
> > > --
> > > 2.45.1
> > >