[PATCH v2 2/2] drm/nouveau/gsp: Use the sg allocator for level 2 of radix3
Ben Skeggs
bskeggs at nvidia.com
Sun Apr 28 16:36:34 UTC 2024
On 30/4/24 04:23, Lyude Paul wrote:
> Currently we allocate all 3 levels of radix3 page tables using
> nvkm_gsp_mem_ctor(), which uses dma_alloc_coherent() for allocating all of
> the relevant memory. This can end up failing in scenarios where the system
> has very high memory fragmentation, and we can't find enough contiguous
> memory to allocate level 2 of the page table.
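>
> For a sense of scale, level 2 needs one 8-byte PTE per 4KiB page of the
> buffer being mapped, so for a hypothetical ~256MiB suspend/resume image:
>
>   bufsize = (256 MiB / GSP_PAGE_SIZE) * sizeof(u64)   /* image size assumed */
>           = 65536 * 8 bytes
>           = 512 KiB
>
> which is an order-7 contiguous allocation - exactly the order that fails
> in the trace below once memory is fragmented.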
>
> In practice this shows up as runtime PM failures on systems with high
> memory fragmentation, since we fail to allocate the page table for our
> suspend/resume buffer:
>
> kworker/10:2: page allocation failure: order:7, mode:0xcc0(GFP_KERNEL),
> nodemask=(null),cpuset=/,mems_allowed=0
> CPU: 10 PID: 479809 Comm: kworker/10:2 Not tainted
> 6.8.6-201.ChopperV6.fc39.x86_64 #1
> Hardware name: SLIMBOOK Executive/Executive, BIOS N.1.10GRU06 02/02/2024
> Workqueue: pm pm_runtime_work
> Call Trace:
> <TASK>
> dump_stack_lvl+0x64/0x80
> warn_alloc+0x165/0x1e0
> ? __alloc_pages_direct_compact+0xb3/0x2b0
> __alloc_pages_slowpath.constprop.0+0xd7d/0xde0
> __alloc_pages+0x32d/0x350
> __dma_direct_alloc_pages.isra.0+0x16a/0x2b0
> dma_direct_alloc+0x70/0x270
> nvkm_gsp_radix3_sg+0x5e/0x130 [nouveau]
> r535_gsp_fini+0x1d4/0x350 [nouveau]
> nvkm_subdev_fini+0x67/0x150 [nouveau]
> nvkm_device_fini+0x95/0x1e0 [nouveau]
> nvkm_udevice_fini+0x53/0x70 [nouveau]
> nvkm_object_fini+0xb9/0x240 [nouveau]
> nvkm_object_fini+0x75/0x240 [nouveau]
> nouveau_do_suspend+0xf5/0x280 [nouveau]
> nouveau_pmops_runtime_suspend+0x3e/0xb0 [nouveau]
> pci_pm_runtime_suspend+0x67/0x1e0
> ? __pfx_pci_pm_runtime_suspend+0x10/0x10
> __rpm_callback+0x41/0x170
> ? __pfx_pci_pm_runtime_suspend+0x10/0x10
> rpm_callback+0x5d/0x70
> ? __pfx_pci_pm_runtime_suspend+0x10/0x10
> rpm_suspend+0x120/0x6a0
> pm_runtime_work+0x98/0xb0
> process_one_work+0x171/0x340
> worker_thread+0x27b/0x3a0
> ? __pfx_worker_thread+0x10/0x10
> kthread+0xe5/0x120
> ? __pfx_kthread+0x10/0x10
> ret_from_fork+0x31/0x50
> ? __pfx_kthread+0x10/0x10
> ret_from_fork_asm+0x1b/0x30
>
> Luckily, we don't actually need coherent, physically contiguous memory for
> the level 2 page table, since the GPU is passed a radix3 page table for
> the suspend/resume data. So, let's rewrite nvkm_gsp_radix3_sg() to use the
> sg allocator for level 2. We continue using coherent allocations for lvl0
> and lvl1, since each only takes a single page.
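>
> In other words, the resulting radix3 structure looks roughly like this,
> with only lvl0 and lvl1 needing to be single (contiguous) pages:
>
>   lvl0 (1 page, coherent) : PTE[0] -> bus address of lvl1
>   lvl1 (1 page, coherent) : PTE[n] -> bus address of the nth lvl2 page
>   lvl2 (sg allocation)    : PTE[m] -> bus address of the mth data page
>
> Because lvl1 records the bus address of every lvl2 page individually, the
> lvl2 pages themselves no longer have to be physically contiguous.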
>
> V2:
> * When writing out the page table for level 2, actually advance to the
> next scatterlist entry once we reach the end of the one we're currently
> filling (see the annotated loop below)
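>
> Roughly, the fixed loop in the hunk below looks like this (same code,
> commentary added):
>
>   for_each_sgtable_sg(&rx3->lvl2, sg, i) {
>           /* destination: the current chunk of the level 2 table */
>           pte = sg_virt(sg);
>           sgl_end = (void *)pte + sg->length;
>
>           /* source: resume walking sgt's pages at page_idx */
>           for_each_sgtable_dma_page(sgt, &sg_dma_iter, page_idx) {
>                   *pte++ = sg_page_iter_dma_address(&sg_dma_iter);
>                   page_idx++;
>
>                   /* chunk full: break so the outer loop can move on
>                    * to the next level 2 chunk */
>                   if ((void *)pte >= sgl_end)
>                           break;
>           }
>   }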
>
> Signed-off-by: Lyude Paul <lyude at redhat.com>
> Cc: stable at vger.kernel.org
Hey Lyude,
These are looking good! Thank you for looking at this issue. For both
patches:
Reviewed-by: Ben Skeggs <bskeggs at nvidia.com>
Ben.
> ---
> .../gpu/drm/nouveau/include/nvkm/subdev/gsp.h | 4 +-
> .../gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 77 ++++++++++++-------
> 2 files changed, 54 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h
> index 6f5d376d8fcc1..a11d16a16c3b2 100644
> --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h
> +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h
> @@ -15,7 +15,9 @@ struct nvkm_gsp_mem {
> };
>
> struct nvkm_gsp_radix3 {
> - struct nvkm_gsp_mem mem[3];
> + struct nvkm_gsp_mem lvl0;
> + struct nvkm_gsp_mem lvl1;
> + struct sg_table lvl2;
> };
>
> int nvkm_gsp_sg(struct nvkm_device *, u64 size, struct sg_table *);
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c
> index 9858c1438aa7f..fd4e80ba6adfc 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c
> @@ -1624,7 +1624,7 @@ r535_gsp_wpr_meta_init(struct nvkm_gsp *gsp)
> meta->magic = GSP_FW_WPR_META_MAGIC;
> meta->revision = GSP_FW_WPR_META_REVISION;
>
> - meta->sysmemAddrOfRadix3Elf = gsp->radix3.mem[0].addr;
> + meta->sysmemAddrOfRadix3Elf = gsp->radix3.lvl0.addr;
> meta->sizeOfRadix3Elf = gsp->fb.wpr2.elf.size;
>
> meta->sysmemAddrOfBootloader = gsp->boot.fw.addr;
> @@ -1919,8 +1919,9 @@ nvkm_gsp_sg(struct nvkm_device *device, u64 size, struct sg_table *sgt)
> static void
> nvkm_gsp_radix3_dtor(struct nvkm_gsp *gsp, struct nvkm_gsp_radix3 *rx3)
> {
> - for (int i = ARRAY_SIZE(rx3->mem) - 1; i >= 0; i--)
> - nvkm_gsp_mem_dtor(gsp, &rx3->mem[i]);
> + nvkm_gsp_sg_free(gsp->subdev.device, &rx3->lvl2);
> + nvkm_gsp_mem_dtor(gsp, &rx3->lvl1);
> + nvkm_gsp_mem_dtor(gsp, &rx3->lvl0);
> }
>
> /**
> @@ -1960,36 +1961,60 @@ static int
> nvkm_gsp_radix3_sg(struct nvkm_gsp *gsp, struct sg_table *sgt, u64 size,
> struct nvkm_gsp_radix3 *rx3)
> {
> - u64 addr;
> + struct sg_dma_page_iter sg_dma_iter;
> + struct scatterlist *sg;
> + size_t bufsize;
> + u64 *pte;
> + int ret, i, page_idx = 0;
>
> - for (int i = ARRAY_SIZE(rx3->mem) - 1; i >= 0; i--) {
> - u64 *ptes;
> - size_t bufsize;
> - int ret, idx;
> + ret = nvkm_gsp_mem_ctor(gsp, GSP_PAGE_SIZE, &rx3->lvl0);
> + if (ret)
> + return ret;
>
> - bufsize = ALIGN((size / GSP_PAGE_SIZE) * sizeof(u64), GSP_PAGE_SIZE);
> - ret = nvkm_gsp_mem_ctor(gsp, bufsize, &rx3->mem[i]);
> - if (ret)
> - return ret;
> + ret = nvkm_gsp_mem_ctor(gsp, GSP_PAGE_SIZE, &rx3->lvl1);
> + if (ret)
> + goto lvl1_fail;
>
> - ptes = rx3->mem[i].data;
> - if (i == 2) {
> - struct scatterlist *sgl;
> + // Allocate level 2
> + bufsize = ALIGN((size / GSP_PAGE_SIZE) * sizeof(u64), GSP_PAGE_SIZE);
> + ret = nvkm_gsp_sg(gsp->subdev.device, bufsize, &rx3->lvl2);
> + if (ret)
> + goto lvl2_fail;
>
> - for_each_sgtable_dma_sg(sgt, sgl, idx) {
> - for (int j = 0; j < sg_dma_len(sgl) / GSP_PAGE_SIZE; j++)
> - *ptes++ = sg_dma_address(sgl) + (GSP_PAGE_SIZE * j);
> - }
> - } else {
> - for (int j = 0; j < size / GSP_PAGE_SIZE; j++)
> - *ptes++ = addr + GSP_PAGE_SIZE * j;
> + // Write the bus address of level 1 to level 0
> + pte = rx3->lvl0.data;
> + *pte = rx3->lvl1.addr;
> +
> + // Write the bus address of each page in level 2 to level 1
> + pte = rx3->lvl1.data;
> + for_each_sgtable_dma_page(&rx3->lvl2, &sg_dma_iter, 0)
> + *pte++ = sg_page_iter_dma_address(&sg_dma_iter);
> +
> + // Finally, write the bus address of each page in sgt to level 2
> + for_each_sgtable_sg(&rx3->lvl2, sg, i) {
> + void *sgl_end;
> +
> + pte = sg_virt(sg);
> + sgl_end = (void*)pte + sg->length;
> +
> + for_each_sgtable_dma_page(sgt, &sg_dma_iter, page_idx) {
> + *pte++ = sg_page_iter_dma_address(&sg_dma_iter);
> + page_idx++;
> +
> + // Go to the next scatterlist for level 2 if we've reached the end
> + if ((void*)pte >= sgl_end)
> + break;
> }
> + }
>
> - size = rx3->mem[i].size;
> - addr = rx3->mem[i].addr;
> + if (ret) {
> +lvl2_fail:
> + nvkm_gsp_mem_dtor(gsp, &rx3->lvl1);
> +lvl1_fail:
> + nvkm_gsp_mem_dtor(gsp, &rx3->lvl0);
> }
>
> - return 0;
> + return ret;
> }
>
> int
> @@ -2021,7 +2046,7 @@ r535_gsp_fini(struct nvkm_gsp *gsp, bool suspend)
> sr = gsp->sr.meta.data;
> sr->magic = GSP_FW_SR_META_MAGIC;
> sr->revision = GSP_FW_SR_META_REVISION;
> - sr->sysmemAddrOfSuspendResumeData = gsp->sr.radix3.mem[0].addr;
> + sr->sysmemAddrOfSuspendResumeData = gsp->sr.radix3.lvl0.addr;
> sr->sizeOfSuspendResumeData = len;
>
> mbox0 = lower_32_bits(gsp->sr.meta.addr);