[PATCH v4 7/7] dma-buf: system_heap: Add a system-uncached heap re-using the system heap
Hillf Danton
hdanton at sina.com
Thu Oct 29 07:10:37 UTC 2020
On Thu, 29 Oct 2020 00:16:24 +0000 John Stultz wrote:
>
> This adds a heap that allocates non-contiguous buffers that are
> marked as writecombined, so they are not cached by the CPU.
>
> This is useful because most graphics buffers are never touched
> by the CPU, or are only written into once by it. So when mapping
> the buffer over and over between devices, we can skip the CPU
> syncing, which saves a lot of cache management overhead and
> greatly improves performance.
>
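To make it concrete where those savings come from: with a cached heap,
userspace brackets every CPU access with DMA_BUF_IOCTL_SYNC, and each
bracket costs cache maintenance on every attached device's mapping. A
minimal userspace sketch of the write-once pattern, assuming only the
standard <linux/dma-buf.h> uapi (fill_buffer and its parameters are
made-up names):

#include <stddef.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/dma-buf.h>

/* Write-once CPU fill of an mmap'ed dma-buf, bracketed by the sync ioctl. */
static void fill_buffer(int buf_fd, void *map, size_t len)
{
	struct dma_buf_sync sync = {
		.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE,
	};

	ioctl(buf_fd, DMA_BUF_IOCTL_SYNC, &sync);	/* begin_cpu_access */
	memset(map, 0, len);				/* the single CPU touch */
	sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
	ioctl(buf_fd, DMA_BUF_IOCTL_SYNC, &sync);	/* end_cpu_access */
}

With this uncached heap, the begin/end hooks skip the per-attachment
sync, so the bracketing stays correct but becomes nearly free.
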
> For folks using ION, there was an ION_FLAG_CACHED flag, which
> signaled whether the returned buffer should be CPU cacheable or not.
> With DMA-BUF heaps, we do not yet have such a flag, and by default
> the current heaps (system and cma) produce CPU cacheable buffers.
> So for folks transitioning from ION to DMA-BUF Heaps, this fills
> in some of that missing functionality.
>
> There has been a suggestion to make this functionality a flag
> (DMAHEAP_FLAG_UNCACHED?) on the system heap, similar to how
> ION used ION_FLAG_CACHED. But I want to make sure an
> _UNCACHED flag would truly be a generic attribute across all
> heaps. So far that has been unclear, so having it as a separate
> heap seems better for now. (But I'm open to discussion on this
> point!)
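
For what it's worth, keeping it as a separate heap also keeps the
userspace side trivial: picking uncached is just a different chardev
name. A minimal sketch against the existing <linux/dma-heap.h> uapi
(heap_alloc is a hypothetical helper):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dma-heap.h>

/* Allocate len bytes from /dev/dma_heap/<name>; returns a dma-buf fd. */
static int heap_alloc(const char *name, size_t len)
{
	struct dma_heap_allocation_data data = {
		.len = len,
		.fd_flags = O_RDWR | O_CLOEXEC,
	};
	char path[64];
	int heap_fd, ret;

	snprintf(path, sizeof(path), "/dev/dma_heap/%s", name);
	heap_fd = open(path, O_RDONLY | O_CLOEXEC);
	if (heap_fd < 0)
		return -1;
	ret = ioctl(heap_fd, DMA_HEAP_IOCTL_ALLOC, &data);
	close(heap_fd);
	return ret < 0 ? -1 : (int)data.fd;
}

/* e.g. heap_alloc("system-uncached", len) instead of heap_alloc("system", len) */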
>
> This is a rework of earlier efforts to add an uncached system heap,
> done by reusing the existing system heap and adding just a bit of
> logic to handle the uncached case.
>
> Feedback would be very welcome!
>
> Many thanks to Liam Mark for his help to get this working.
>
> Pending open-source users of this code include:
> * AOSP HiKey960 gralloc:
> - https://android-review.googlesource.com/c/device/linaro/hikey/+/1399519
> - Visibly improves performance over the system heap
> * AOSP Codec2 (possibly, needs more review):
> - https://android-review.googlesource.com/c/platform/frameworks/av/+/1360640/17/media/codec2/vndk/C2DmaBufAllocator.cpp#325
>
> Cc: Sumit Semwal <sumit.semwal at linaro.org>
> Cc: Liam Mark <lmark at codeaurora.org>
> Cc: Laura Abbott <labbott at kernel.org>
> Cc: Brian Starkey <Brian.Starkey at arm.com>
> Cc: Hridya Valsaraju <hridya at google.com>
> Cc: Suren Baghdasaryan <surenb at google.com>
> Cc: Sandeep Patil <sspatil at google.com>
> Cc: Daniel Mentz <danielmentz at google.com>
> Cc: Chris Goldsworthy <cgoldswo at codeaurora.org>
> Cc: Ørjan Eide <orjan.eide at arm.com>
> Cc: Robin Murphy <robin.murphy at arm.com>
> Cc: Ezequiel Garcia <ezequiel at collabora.com>
> Cc: Simon Ser <contact at emersion.fr>
> Cc: James Jones <jajones at nvidia.com>
> Cc: linux-media at vger.kernel.org
> Cc: dri-devel at lists.freedesktop.org
> Signed-off-by: John Stultz <john.stultz at linaro.org>
> ---
> v4:
> * Make sys_uncached_heap static, as
> Reported-by: kernel test robot <lkp at intel.com>
> * Fix wrong return value, caught by smatch
> Reported-by: kernel test robot <lkp at intel.com>
> Reported-by: Dan Carpenter <dan.carpenter at oracle.com>
> * Ensure we call flush/invalidate_kernel_vmap_range() in the
> uncached cases to try to address feedback about VIVT caches
> from Christoph
> * Reorder a few lines as suggested by BrianS
> * Avoid holding the initial mapping for the lifetime of the buffer
> as suggested by BrianS
> * Fix an unlikely race between allocate and updating the dma_mask
> that BrianS noticed.
> ---
> drivers/dma-buf/heaps/system_heap.c | 111 ++++++++++++++++++++++++----
> 1 file changed, 95 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
> index ef4b2c1032df..a328c76249d2 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -22,6 +22,7 @@
> #include <linux/vmalloc.h>
>
> static struct dma_heap *sys_heap;
> +static struct dma_heap *sys_uncached_heap;
>
> struct system_heap_buffer {
> struct dma_heap *heap;
> @@ -31,6 +32,8 @@ struct system_heap_buffer {
> struct sg_table sg_table;
> int vmap_cnt;
> void *vaddr;
> +
> + bool uncached;
> };
>
> struct dma_heap_attachment {
> @@ -38,6 +41,8 @@ struct dma_heap_attachment {
> struct sg_table *table;
> struct list_head list;
> bool mapped;
> +
> + bool uncached;
> };
>
> #define HIGH_ORDER_GFP (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \
> @@ -94,7 +99,7 @@ static int system_heap_attach(struct dma_buf *dmabuf,
> a->dev = attachment->dev;
> INIT_LIST_HEAD(&a->list);
> a->mapped = false;
> -
> + a->uncached = buffer->uncached;
> attachment->priv = a;
>
> mutex_lock(&buffer->lock);
> @@ -124,9 +129,13 @@ static struct sg_table *system_heap_map_dma_buf(struct dma_buf_attachment *attac
> {
> struct dma_heap_attachment *a = attachment->priv;
> struct sg_table *table = a->table;
> + int attr = 0;
> int ret;
>
> - ret = dma_map_sgtable(attachment->dev, table, direction, 0);
> + if (a->uncached)
> + attr = DMA_ATTR_SKIP_CPU_SYNC;
> +
> + ret = dma_map_sgtable(attachment->dev, table, direction, attr);
> if (ret)
> return ERR_PTR(ret);
>
> @@ -139,9 +148,12 @@ static void system_heap_unmap_dma_buf(struct dma_buf_attachment *attachment,
> enum dma_data_direction direction)
> {
> struct dma_heap_attachment *a = attachment->priv;
> + int attr = 0;
>
> + if (a->uncached)
> + attr = DMA_ATTR_SKIP_CPU_SYNC;
> a->mapped = false;
> - dma_unmap_sgtable(attachment->dev, table, direction, 0);
> + dma_unmap_sgtable(attachment->dev, table, direction, attr);
> }
>
> static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
> @@ -155,10 +167,12 @@ static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
> if (buffer->vmap_cnt)
> invalidate_kernel_vmap_range(buffer->vaddr, buffer->len);
>
> - list_for_each_entry(a, &buffer->attachments, list) {
> - if (!a->mapped)
> - continue;
> - dma_sync_sgtable_for_cpu(a->dev, a->table, direction);
> + if (!buffer->uncached) {
> + list_for_each_entry(a, &buffer->attachments, list) {
> + if (!a->mapped)
> + continue;
> + dma_sync_sgtable_for_cpu(a->dev, a->table, direction);
> + }
> }
> mutex_unlock(&buffer->lock);
>
> @@ -176,10 +190,12 @@ static int system_heap_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
> if (buffer->vmap_cnt)
> flush_kernel_vmap_range(buffer->vaddr, buffer->len);
>
> - list_for_each_entry(a, &buffer->attachments, list) {
> - if (!a->mapped)
> - continue;
> - dma_sync_sgtable_for_device(a->dev, a->table, direction);
> + if (!buffer->uncached) {
> + list_for_each_entry(a, &buffer->attachments, list) {
> + if (!a->mapped)
> + continue;
> + dma_sync_sgtable_for_device(a->dev, a->table, direction);
> + }
> }
> mutex_unlock(&buffer->lock);
>
> @@ -194,6 +210,9 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
> struct sg_page_iter piter;
> int ret;
>
> + if (buffer->uncached)
> + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
> +
Why open-code the write-combined mapping here instead of going through
dma_mmap_wc() and friends?
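For reference, the route I had in mind looks roughly like the sketch
below, though it assumes cpu_addr/dma_handle came from dma_alloc_wc()
on the heap device. Since this heap hands out raw pages instead, the
open-coded pgprot_writecombine() above may well be the only workable
option (sketch only; wc_buffer_mmap is a made-up name):

#include <linux/dma-mapping.h>

/* Remap a dma_alloc_wc() buffer write-combined into userspace. */
static int wc_buffer_mmap(struct device *dev, struct vm_area_struct *vma,
			  void *cpu_addr, dma_addr_t dma_handle, size_t len)
{
	return dma_mmap_wc(dev, vma, cpu_addr, dma_handle, len);
}
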
> for_each_sgtable_page(table, &piter, vma->vm_pgoff) {
> struct page *page = sg_page_iter_page(&piter);
>
> @@ -215,17 +234,21 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer)
> struct page **pages = vmalloc(sizeof(struct page *) * npages);
> struct page **tmp = pages;
> struct sg_page_iter piter;
> + pgprot_t pgprot = PAGE_KERNEL;
> void *vaddr;
>
> if (!pages)
> return ERR_PTR(-ENOMEM);
>
> + if (buffer->uncached)
> + pgprot = pgprot_writecombine(PAGE_KERNEL);
> +
> for_each_sgtable_page(table, &piter, 0) {
> WARN_ON(tmp - pages >= npages);
> *tmp++ = sg_page_iter_page(&piter);
> }
>
> - vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
> + vaddr = vmap(pages, npages, VM_MAP, pgprot);
> vfree(pages);
>
> if (!vaddr)
> @@ -320,10 +343,11 @@ static struct page *alloc_largest_available(unsigned long size,
> return NULL;
> }
>
> -static int system_heap_allocate(struct dma_heap *heap,
> - unsigned long len,
> - unsigned long fd_flags,
> - unsigned long heap_flags)
> +static int system_heap_do_allocate(struct dma_heap *heap,
> + unsigned long len,
> + unsigned long fd_flags,
> + unsigned long heap_flags,
> + bool uncached)
> {
> struct system_heap_buffer *buffer;
> DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> @@ -344,6 +368,7 @@ static int system_heap_allocate(struct dma_heap *heap,
> mutex_init(&buffer->lock);
> buffer->heap = heap;
> buffer->len = len;
> + buffer->uncached = uncached;
>
> INIT_LIST_HEAD(&pages);
> i = 0;
> @@ -393,6 +418,18 @@ static int system_heap_allocate(struct dma_heap *heap,
> /* just return, as put will call release and that will free */
> return ret;
> }
> +
> + /*
> + * For uncached buffers, we need to initially flush cpu cache, since
> + * the __GFP_ZERO on the allocation means the zeroing was done by the
> + * cpu and thus it is likely cached. Map (and implicitly flush) and
> + * unmap it now so we don't get corruption later on.
> + */
> + if (buffer->uncached) {
> + dma_map_sgtable(dma_heap_get_dev(heap), table, DMA_BIDIRECTIONAL, 0);
> + dma_unmap_sgtable(dma_heap_get_dev(heap), table, DMA_BIDIRECTIONAL, 0);
> + }
> +
> return ret;
>
> free_pages:
> @@ -410,10 +447,40 @@ static int system_heap_allocate(struct dma_heap *heap,
> return ret;
> }
>
> +static int system_heap_allocate(struct dma_heap *heap,
> + unsigned long len,
> + unsigned long fd_flags,
> + unsigned long heap_flags)
> +{
> + return system_heap_do_allocate(heap, len, fd_flags, heap_flags, false);
> +}
> +
> static const struct dma_heap_ops system_heap_ops = {
> .allocate = system_heap_allocate,
> };
>
> +static int system_uncached_heap_allocate(struct dma_heap *heap,
> + unsigned long len,
> + unsigned long fd_flags,
> + unsigned long heap_flags)
> +{
> + return system_heap_do_allocate(heap, len, fd_flags, heap_flags, true);
> +}
> +
> +/* Dummy function to be used until we can call coerce_mask_and_coherent */
> +static int system_uncached_heap_not_initialized(struct dma_heap *heap,
> + unsigned long len,
> + unsigned long fd_flags,
> + unsigned long heap_flags)
> +{
> + return -EBUSY;
> +}
> +
> +static struct dma_heap_ops system_uncached_heap_ops = {
> + /* After system_heap_create is complete, we will swap this */
> + .allocate = system_uncached_heap_not_initialized,
> +};
> +
> static int system_heap_create(void)
> {
> struct dma_heap_export_info exp_info;
> @@ -426,6 +493,18 @@ static int system_heap_create(void)
> if (IS_ERR(sys_heap))
> return PTR_ERR(sys_heap);
>
> + exp_info.name = "system-uncached";
> + exp_info.ops = &system_uncached_heap_ops;
> + exp_info.priv = NULL;
> +
> + sys_uncached_heap = dma_heap_add(&exp_info);
> + if (IS_ERR(sys_uncached_heap))
> + return PTR_ERR(sys_uncached_heap);
> +
> + dma_coerce_mask_and_coherent(dma_heap_get_dev(sys_uncached_heap), DMA_BIT_MASK(64));
> + mb(); /* make sure we only set allocate after dma_mask is set */
> + system_uncached_heap_ops.allocate = system_uncached_heap_allocate;
> +
> return 0;
> }
> module_init(system_heap_create);
> --
> 2.17.1