[PATCH v3 7/7] dma-buf: system_heap: Add a system-uncached heap re-using the system heap

Thu Oct 8 11:51:01 UTC 2020

On Sat, Oct 03, 2020 at 04:02:57AM +0000, John Stultz wrote:
> This adds a heap that allocates non-contiguous buffers that are
> marked as writecombined, so they are not cached by the CPU.
> 
> This is useful, as most graphics buffers are usually not touched
> by the CPU or only written into once by the CPU. So when mapping
> the buffer over and over between devices, we can skip the CPU
> syncing, which saves a lot of cache management overhead, greatly
> improving performance.
> 
> For folk using ION, there was a ION_FLAG_CACHED flag, which
> signaled if the returned buffer should be CPU cacheable or not.
> With DMA-BUF heaps, we do not yet have such a flag, and by default
> the current heaps (system and cma) produce CPU cachable buffers.
> So for folks transitioning from ION to DMA-BUF Heaps, this fills
> in some of that missing functionality.
> 
> There has been a suggestion to make this functionality a flag
> (DMAHEAP_FLAG_UNCACHED?) on the system heap, similar to how
> ION used the ION_FLAG_CACHED. But I want to make sure an
> _UNCACHED flag would truely be a generic attribute across all
> heaps. So far that has been unclear, so having it as a separate
> heap seemes better for now. (But I'm open to discussion on this
> point!)
> 
> This is a rework of earlier efforts to add a uncached system heap,
> done utilizing the exisitng system heap, adding just a bit of
> logic to handle the uncached case.
> 
> Feedback would be very welcome!
> 
> Many thanks to Liam Mark for his help to get this working.
> 
> Pending opensource users of this code include:
> * AOSP HiKey960 gralloc:
>   - https://android-review.googlesource.com/c/device/linaro/hikey/+/1399519
>   - Visibly improves performance over the system heap
> * AOSP Codec2 (possibly, needs more review):
>   - https://android-review.googlesource.com/c/platform/frameworks/av/+/1360640/17/media/codec2/vndk/C2DmaBufAllocator.cpp#325
> 
> Cc: Sumit Semwal <sumit.semwal at linaro.org>
> Cc: Liam Mark <lmark at codeaurora.org>
> Cc: Laura Abbott <labbott at kernel.org>
> Cc: Brian Starkey <Brian.Starkey at arm.com>
> Cc: Hridya Valsaraju <hridya at google.com>
> Cc: Suren Baghdasaryan <surenb at google.com>
> Cc: Sandeep Patil <sspatil at google.com>
> Cc: Daniel Mentz <danielmentz at google.com>
> Cc: Chris Goldsworthy <cgoldswo at codeaurora.org>
> Cc: ï¿½rjan Eide <orjan.eide at arm.com>
> Cc: Robin Murphy <robin.murphy at arm.com>
> Cc: Ezequiel Garcia <ezequiel at collabora.com>
> Cc: Simon Ser <contact at emersion.fr>
> Cc: James Jones <jajones at nvidia.com>
> Cc: linux-media at vger.kernel.org
> Cc: dri-devel at lists.freedesktop.org
> Signed-off-by: John Stultz <john.stultz at linaro.org>
> ---
>  drivers/dma-buf/heaps/system_heap.c | 87 ++++++++++++++++++++++++++---
>  1 file changed, 79 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
> index 2b8d4b6abacb..952f1fd9dacf 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -22,6 +22,7 @@
>  #include <linux/vmalloc.h>
>  
>  struct dma_heap *sys_heap;
> +struct dma_heap *sys_uncached_heap;
>  
>  struct system_heap_buffer {
>  	struct dma_heap *heap;
> @@ -31,6 +32,8 @@ struct system_heap_buffer {
>  	struct sg_table sg_table;
>  	int vmap_cnt;
>  	void *vaddr;
> +
> +	bool uncached;
>  };
>  
>  struct dma_heap_attachment {
> @@ -38,6 +41,8 @@ struct dma_heap_attachment {
>  	struct sg_table *table;
>  	struct list_head list;
>  	bool mapped;
> +
> +	bool uncached;
>  };
>  
>  #define HIGH_ORDER_GFP  (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \
> @@ -94,7 +99,7 @@ static int system_heap_attach(struct dma_buf *dmabuf,
>  	a->dev = attachment->dev;
>  	INIT_LIST_HEAD(&a->list);
>  	a->mapped = false;
> -
> +	a->uncached = buffer->uncached;
>  	attachment->priv = a;
>  
>  	mutex_lock(&buffer->lock);
> @@ -124,9 +129,13 @@ static struct sg_table *system_heap_map_dma_buf(struct dma_buf_attachment *attac
>  {
>  	struct dma_heap_attachment *a = attachment->priv;
>  	struct sg_table *table = a->table;
> +	int attr = 0;
>  	int ret;
>  
> -	ret = dma_map_sgtable(attachment->dev, table, direction, 0);
> +	if (a->uncached)
> +		attr = DMA_ATTR_SKIP_CPU_SYNC;
> +
> +	ret = dma_map_sgtable(attachment->dev, table, direction, attr);
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> @@ -139,9 +148,12 @@ static void system_heap_unmap_dma_buf(struct dma_buf_attachment *attachment,
>  				      enum dma_data_direction direction)
>  {
>  	struct dma_heap_attachment *a = attachment->priv;
> +	int attr = 0;
>  
> +	if (a->uncached)
> +		attr = DMA_ATTR_SKIP_CPU_SYNC;
>  	a->mapped = false;
> -	dma_unmap_sgtable(attachment->dev, table, direction, 0);
> +	dma_unmap_sgtable(attachment->dev, table, direction, attr);
>  }
>  
>  static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
> @@ -150,6 +162,9 @@ static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
>  	struct system_heap_buffer *buffer = dmabuf->priv;
>  	struct dma_heap_attachment *a;
>  
> +	if (buffer->uncached)
> +		return 0;
> +
>  	mutex_lock(&buffer->lock);
>  
>  	if (buffer->vmap_cnt)
> @@ -171,6 +186,9 @@ static int system_heap_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
>  	struct system_heap_buffer *buffer = dmabuf->priv;
>  	struct dma_heap_attachment *a;
>  
> +	if (buffer->uncached)
> +		return 0;
> +
>  	mutex_lock(&buffer->lock);
>  
>  	if (buffer->vmap_cnt)
> @@ -194,6 +212,9 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
>  	struct sg_page_iter piter;
>  	int ret;
>  
> +	if (buffer->uncached)
> +		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
> +
>  	for_each_sgtable_page(table, &piter, vma->vm_pgoff) {
>  		struct page *page = sg_page_iter_page(&piter);
>  
> @@ -215,8 +236,12 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer)
>  	struct page **pages = vmalloc(sizeof(struct page *) * npages);
>  	struct page **tmp = pages;
>  	struct sg_page_iter piter;
> +	pgprot_t pgprot = PAGE_KERNEL;
>  	void *vaddr;
>  
> +	if (buffer->uncached)
> +		pgprot = pgprot_writecombine(PAGE_KERNEL);

I think this should go after the 'if (!pages)' check. Best to get the
allocation failure check as close to the allocation as possible IMO.

> +
>  	if (!pages)
>  		return ERR_PTR(-ENOMEM);
>  
> @@ -225,7 +250,7 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer)
>  		*tmp++ = sg_page_iter_page(&piter);
>  	}
>  
> -	vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
> +	vaddr = vmap(pages, npages, VM_MAP, pgprot);
>  	vfree(pages);
>  
>  	if (!vaddr)
> @@ -278,6 +303,10 @@ static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
>  	int i;
>  
>  	table = &buffer->sg_table;
> +	/* Unmap the uncached buffers from the heap device (pairs with the map on allocate) */
> +	if (buffer->uncached)
> +		dma_unmap_sgtable(dma_heap_get_dev(buffer->heap), table, DMA_BIDIRECTIONAL, 0);
> +
>  	for_each_sg(table->sgl, sg, table->nents, i) {
>  		struct page *page = sg_page(sg);
>  
> @@ -320,10 +349,11 @@ static struct page *alloc_largest_available(unsigned long size,
>  	return NULL;
>  }
>  
> -static int system_heap_allocate(struct dma_heap *heap,
> -				unsigned long len,
> -				unsigned long fd_flags,
> -				unsigned long heap_flags)
> +static int system_heap_do_allocate(struct dma_heap *heap,
> +				   unsigned long len,
> +				   unsigned long fd_flags,
> +				   unsigned long heap_flags,
> +				   bool uncached)
>  {
>  	struct system_heap_buffer *buffer;
>  	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> @@ -344,6 +374,7 @@ static int system_heap_allocate(struct dma_heap *heap,
>  	mutex_init(&buffer->lock);
>  	buffer->heap = heap;
>  	buffer->len = len;
> +	buffer->uncached = uncached;
>  
>  	INIT_LIST_HEAD(&pages);
>  	i = 0;
> @@ -393,6 +424,16 @@ static int system_heap_allocate(struct dma_heap *heap,
>  		/* just return, as put will call release and that will free */
>  		return ret;
>  	}
> +
> +	/*
> +	 * For uncached buffers, we need to initially flush cpu cache, since
> +	 * the __GFP_ZERO on the allocation means the zeroing was done by the
> +	 * cpu and thus it is likely cached. Map (and implicitly flush) it out
> +	 * now so we don't get corruption later on.
> +	 */
> +	if (buffer->uncached)
> +		dma_map_sgtable(dma_heap_get_dev(heap), table, DMA_BIDIRECTIONAL, 0);

Do we have to keep this mapping around for the entire lifetime of the
buffer?

Also, this problem (and solution) keeps lingering around. It really
feels like there should be a better way to solve "clean the linear
mapping all the way to DRAM", but I don't know what that should be.

> +
>  	return ret;
>  
>  free_pages:
> @@ -410,10 +451,30 @@ static int system_heap_allocate(struct dma_heap *heap,
>  	return ret;
>  }
>  
> +static int system_heap_allocate(struct dma_heap *heap,
> +				unsigned long len,
> +				unsigned long fd_flags,
> +				unsigned long heap_flags)
> +{
> +	return system_heap_do_allocate(heap, len, fd_flags, heap_flags, false);
> +}
> +
>  static const struct dma_heap_ops system_heap_ops = {
>  	.allocate = system_heap_allocate,
>  };
>  
> +static int system_uncached_heap_allocate(struct dma_heap *heap,
> +					 unsigned long len,
> +					 unsigned long fd_flags,
> +					 unsigned long heap_flags)
> +{
> +	return system_heap_do_allocate(heap, len, fd_flags, heap_flags, true);
> +}
> +
> +static const struct dma_heap_ops system_uncached_heap_ops = {
> +	.allocate = system_uncached_heap_allocate,
> +};
> +
>  static int system_heap_create(void)
>  {
>  	struct dma_heap_export_info exp_info;
> @@ -426,6 +487,16 @@ static int system_heap_create(void)
>  	if (IS_ERR(sys_heap))
>  		return PTR_ERR(sys_heap);
>  
> +	exp_info.name = "system-uncached";
> +	exp_info.ops = &system_uncached_heap_ops;
> +	exp_info.priv = NULL;
> +
> +	sys_uncached_heap = dma_heap_add(&exp_info);
> +	if (IS_ERR(sys_uncached_heap))
> +		return PTR_ERR(sys_heap);
> +

In principle, there's a race here between the heap getting registered
to sysfs and the dma_mask getting updated.

I don't think it would cause a problem in practice, but with the API
as it is, there's no way to avoid it - so I wonder if the
dma_heap_get_dev() accessor isn't the right API design.

Cheers,
-Brian

> +	dma_coerce_mask_and_coherent(dma_heap_get_dev(sys_uncached_heap), DMA_BIT_MASK(64));
> +
>  	return 0;
>  }
>  module_init(system_heap_create);
> -- 
> 2.17.1
>