[Intel-gfx] [PATCH 5/7] drm/i915/ttm, drm/ttm: Add a generic TTM memcpy move for page-based iomem
Jani Nikula
jani.nikula at linux.intel.com
Mon May 17 10:57:10 UTC 2021
On Tue, 11 May 2021, Thomas Hellström <thomas.hellstrom at linux.intel.com> wrote:
> The internal ttm_bo_util memcpy uses vmap functionality, and while it
> probably might be possible to use it for copying in- and out of
> sglist represented io memory, using io_mem_reserve() / io_mem_free()
> callbacks, that would cause problems with fault().
> Instead, implement a method mapping page-by-page using kmap_local()
> semantics. As an additional benefit we then avoid the occasional global
> TLB flushes of vmap() and consuming vmap space, elimination of a critical
> point of failure and with a slight change of semantics we could also push
> the memcpy out async for testing and async driver develpment purposes.
> Pushing out async can be done since there is no memory allocation going on
> that could violate the dma_fence lockdep rules.
>
> Note that drivers that don't want to use struct io_mapping but relies on
> memremap functionality, and that don't want to use scatterlists for
> VRAM may well define specialized (hopefully reusable) iterators for their
> particular environment.
>
> Cc: Christian König <christian.koenig at amd.com>
> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> ---
> drivers/gpu/drm/i915/Makefile | 1 +
> .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.c | 155 ++++++++++++++++++
> .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.h | 141 ++++++++++++++++
> drivers/gpu/drm/ttm/ttm_bo.c | 1 +
> 4 files changed, 298 insertions(+)
> create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
> create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index cb8823570996..958ccc1edfed 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -155,6 +155,7 @@ gem-y += \
> gem/i915_gem_stolen.o \
> gem/i915_gem_throttle.o \
> gem/i915_gem_tiling.o \
> + gem/i915_gem_ttm_bo_util.o \
> gem/i915_gem_userptr.o \
> gem/i915_gem_wait.o \
> gem/i915_gemfs.o
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
> new file mode 100644
> index 000000000000..1116d7df1461
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
> @@ -0,0 +1,155 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +/**
> + * DOC: Usage and intentions.
> + *
> + * This file contains functionality that we might want to move into
> + * ttm_bo_util.c if there is a common interest.
> + * Currently a kmap_local only memcpy with support for page-based iomem regions,
> + * and fast memcpy from write-combined memory.
> + */
> +
> +#include <linux/dma-buf-map.h>
> +#include <linux/highmem.h>
> +#include <linux/io-mapping.h>
> +#include <linux/scatterlist.h>
> +
> +#include "i915_memcpy.h"
> +
> +#include "gem/i915_gem_ttm_bo_util.h"
> +
> +static void i915_ttm_kmap_iter_tt_kmap_local(struct i915_ttm_kmap_iter *iter,
> + struct dma_buf_map *dmap,
> + pgoff_t i)
> +{
> + struct i915_ttm_kmap_iter_tt *iter_tt =
> + container_of(iter, typeof(*iter_tt), base);
> +
> + dma_buf_map_set_vaddr(dmap, kmap_local_page(iter_tt->tt->pages[i]));
> +}
> +
> +static void i915_ttm_kmap_iter_iomap_kmap_local(struct i915_ttm_kmap_iter *iter,
> + struct dma_buf_map *dmap,
> + pgoff_t i)
> +{
> + struct i915_ttm_kmap_iter_iomap *iter_io =
> + container_of(iter, typeof(*iter_io), base);
> + void __iomem *addr;
> +
> +retry:
> + while (i >= iter_io->cache.end) {
> + iter_io->cache.sg = iter_io->cache.sg ?
> + sg_next(iter_io->cache.sg) : iter_io->st->sgl;
> + iter_io->cache.i = iter_io->cache.end;
> + iter_io->cache.end += sg_dma_len(iter_io->cache.sg) >>
> + PAGE_SHIFT;
> + iter_io->cache.offs = sg_dma_address(iter_io->cache.sg) -
> + iter_io->start;
> + }
> +
> + if (i < iter_io->cache.i) {
> + iter_io->cache.end = 0;
> + iter_io->cache.sg = NULL;
> + goto retry;
> + }
> +
> + addr = io_mapping_map_local_wc(iter_io->iomap, iter_io->cache.offs +
> + (((resource_size_t)i - iter_io->cache.i)
> + << PAGE_SHIFT));
> + dma_buf_map_set_vaddr_iomem(dmap, addr);
> +}
> +
> +struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_tt_ops = {
> + .kmap_local = i915_ttm_kmap_iter_tt_kmap_local
> +};
> +
> +struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_io_ops = {
> + .kmap_local = i915_ttm_kmap_iter_iomap_kmap_local
> +};
> +
> +static void kunmap_local_dma_buf_map(struct dma_buf_map *map)
> +{
> + if (map->is_iomem)
> + io_mapping_unmap_local(map->vaddr_iomem);
> + else
> + kunmap_local(map->vaddr);
> +}
> +
> +/**
> + * i915_ttm_move_memcpy - Helper to perform a memcpy ttm move operation.
> + * @bo: The struct ttm_buffer_object.
> + * @new_mem: The struct ttm_resource we're moving to (copy destination).
> + * @new_kmap: A struct i915_ttm_kmap_iter representing the destination resource.
> + * @old_kmap: A struct i915_ttm_kmap_iter representing the source resource.
> + */
> +void i915_ttm_move_memcpy(struct ttm_buffer_object *bo,
> + struct ttm_resource *new_mem,
> + struct i915_ttm_kmap_iter *new_kmap,
> + struct i915_ttm_kmap_iter *old_kmap)
> +{
> + struct ttm_device *bdev = bo->bdev;
> + struct ttm_resource_manager *man = ttm_manager_type(bdev, new_mem->mem_type);
> + struct ttm_tt *ttm = bo->ttm;
> + struct ttm_resource *old_mem = &bo->mem;
> + struct ttm_resource old_copy = *old_mem;
> + struct ttm_resource_manager *old_man = ttm_manager_type(bdev, old_mem->mem_type);
> + struct dma_buf_map old_map, new_map;
> + pgoff_t i;
> +
> + /* For the page-based allocator we need sgtable iterators as well.*/
> +
> + /* Single TTM move. NOP */
> + if (old_man->use_tt && man->use_tt)
> + goto done;
> +
> + /* Don't move nonexistent data. Clear destination instead. */
> + if (old_man->use_tt && !man->use_tt &&
> + (ttm == NULL || !ttm_tt_is_populated(ttm))) {
> + if (ttm && !(ttm->page_flags & TTM_PAGE_FLAG_ZERO_ALLOC))
> + goto done;
> +
> + for (i = 0; i < new_mem->num_pages; ++i) {
> + new_kmap->ops->kmap_local(new_kmap, &new_map, i);
> + memset_io(new_map.vaddr_iomem, 0, PAGE_SIZE);
> + kunmap_local_dma_buf_map(&new_map);
> + }
> + goto done;
> + }
> +
> + for (i = 0; i < new_mem->num_pages; ++i) {
> + new_kmap->ops->kmap_local(new_kmap, &new_map, i);
> + old_kmap->ops->kmap_local(old_kmap, &old_map, i);
> + if (!old_map.is_iomem ||
> + !i915_memcpy_from_wc(new_map.vaddr, old_map.vaddr, PAGE_SIZE)) {
> + if (!old_map.is_iomem) {
> + dma_buf_map_memcpy_to(&new_map, old_map.vaddr,
> + PAGE_SIZE);
> + } else if (!new_map.is_iomem) {
> + memcpy_fromio(new_map.vaddr, old_map.vaddr_iomem,
> + PAGE_SIZE);
> + } else {
> + pgoff_t j;
> + u32 __iomem *src = old_map.vaddr_iomem;
> + u32 __iomem *dst = new_map.vaddr_iomem;
> +
> + for (j = 0; j < (PAGE_SIZE >> 2); ++j)
> + iowrite32(ioread32(src++), dst++);
> + }
> + }
> + kunmap_local_dma_buf_map(&old_map);
> + kunmap_local_dma_buf_map(&new_map);
> + }
> +
> +done:
> + old_copy = *old_mem;
> +
> + ttm_bo_assign_mem(bo, new_mem);
> +
> + if (!man->use_tt)
> + ttm_bo_tt_destroy(bo);
> +
> + ttm_resource_free(bo, &old_copy);
> +}
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h
> new file mode 100644
> index 000000000000..82c92176718d
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h
> @@ -0,0 +1,141 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +/*
> + * This files contains functionality that we might want to move into
> + * ttm_bo_util.c if there is a common interest.
> + */
> +#ifndef _I915_GEM_TTM_BO_UTIL_H_
> +#define _I915_GEM_TTM_BO_UTIL_H_
> +
> +#include <drm/ttm/ttm_bo_driver.h>
> +struct dma_buf_map;
> +struct io_mapping;
> +struct sg_table;
> +struct scatterlist;
> +
> +struct ttm_tt;
> +struct i915_ttm_kmap_iter;
> +
> +/**
> + * struct i915_ttm_kmap_iter_ops - Ops structure for a struct
> + * i915_ttm_kmap_iter.
> + */
> +struct i915_ttm_kmap_iter_ops {
> + /**
> + * kmap_local - Map a PAGE_SIZE part of the resource using
> + * kmap_local semantics.
> + * @res_kmap: Pointer to the struct i915_ttm_kmap_iter representing
> + * the resource.
> + * @dmap: The struct dma_buf_map holding the virtual address after
> + * the operation.
> + * @i: The location within the resource to map. PAGE_SIZE granularity.
> + */
> + void (*kmap_local)(struct i915_ttm_kmap_iter *res_kmap,
> + struct dma_buf_map *dmap, pgoff_t i);
> +};
> +
> +/**
> + * struct i915_ttm_kmap_iter - Iterator for kmap_local type operations on a
> + * resource.
> + * @ops: Pointer to the operations struct.
> + *
> + * This struct is intended to be embedded in a resource-specific specialization
> + * implementing operations for the resource.
> + *
> + * Nothing stops us from extending the operations to vmap, vmap_pfn etc,
> + * replacing some or parts of the ttm_bo_util. cpu-map functionality.
> + */
> +struct i915_ttm_kmap_iter {
> + const struct i915_ttm_kmap_iter_ops *ops;
> +};
> +
> +/**
> + * struct i915_ttm_kmap_iter_tt - Specialization for a tt (page) backed struct
> + * ttm_resource.
> + * @base: Embedded struct i915_ttm_kmap_iter providing the usage interface
> + * @tt: Cached struct ttm_tt.
> + */
> +struct i915_ttm_kmap_iter_tt {
> + struct i915_ttm_kmap_iter base;
> + struct ttm_tt *tt;
> +};
> +
> +/**
> + * struct i915_ttm_kmap_iter_iomap - Specialization for a struct io_mapping +
> + * struct sg_table backed struct ttm_resource.
> + * @base: Embedded struct i915_ttm_kmap_iter providing the usage interface.
> + * @iomap: struct io_mapping representing the underlying linear io_memory.
> + * @st: sg_table into @iomap, representing the memory of the struct ttm_resource.
> + * @start: Offset that needs to be subtracted from @st to make
> + * sg_dma_address(st->sgl) - @start == 0 for @iomap start.
> + * @cache: Scatterlist traversal cache for fast lookups.
> + * @cache.sg: Pointer to the currently cached scatterlist segment.
> + * @cache.i: First index of @sg. PAGE_SIZE granularity.
> + * @cache.end: Last index + 1 of @sg. PAGE_SIZE granularity.
> + * @cache.offs: First offset into @iomap of @sg. PAGE_SIZE granularity.
> + */
> +struct i915_ttm_kmap_iter_iomap {
> + struct i915_ttm_kmap_iter base;
> + struct io_mapping *iomap;
> + struct sg_table *st;
> + resource_size_t start;
> + struct {
> + struct scatterlist *sg;
> + pgoff_t i;
> + pgoff_t end;
> + pgoff_t offs;
> + } cache;
> +};
> +
> +extern struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_tt_ops;
> +extern struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_io_ops;
> +
> +/**
> + * i915_ttm_kmap_iter_iomap_init - Initialize a struct i915_ttm_kmap_iter_iomap
> + * @iter_io: The struct i915_ttm_kmap_iter_iomap to initialize.
> + * @iomap: The struct io_mapping representing the underlying linear io_memory.
> + * @st: sg_table into @iomap, representing the memory of the struct
> + * ttm_resource.
> + * @start: Offset that needs to be subtracted from @st to make
> + * sg_dma_address(st->sgl) - @start == 0 for @iomap start.
> + *
> + * Return: Pointer to the embedded struct i915_ttm_kmap_iter.
> + */
> +static inline struct i915_ttm_kmap_iter *
> +i915_ttm_kmap_iter_iomap_init(struct i915_ttm_kmap_iter_iomap *iter_io,
> + struct io_mapping *iomap,
> + struct sg_table *st,
> + resource_size_t start)
> +{
> + iter_io->base.ops = &i915_ttm_kmap_iter_io_ops;
> + iter_io->iomap = iomap;
> + iter_io->st = st;
> + iter_io->start = start;
> + memset(&iter_io->cache, 0, sizeof(iter_io->cache));
> + return &iter_io->base;
> +}
> +
> +/**
> + * ttm_kmap_iter_tt_init - Initialize a struct i915_ttm_kmap_iter_tt
> + * @iter_tt: The struct i915_ttm_kmap_iter_tt to initialize.
> + * @tt: Struct ttm_tt holding page pointers of the struct ttm_resource.
> + *
> + * Return: Pointer to the embedded struct i915_ttm_kmap_iter.
> + */
> +static inline struct i915_ttm_kmap_iter *
> +i915_ttm_kmap_iter_tt_init(struct i915_ttm_kmap_iter_tt *iter_tt,
> + struct ttm_tt *tt)
> +{
> + iter_tt->base.ops = &i915_ttm_kmap_iter_tt_ops;
> + iter_tt->tt = tt;
> + return &iter_tt->base;
> +}
Do there functions have a valid *performance* reason to be inline? I
think that's pretty much the only valid reason.
Having these inline forces i915_ttm_kmap_iter_*_ops extern, and they
should really be static. Inline functions complicate header dependencies
and leak the abstractions.
BR,
Jani.
> +
> +void i915_ttm_move_memcpy(struct ttm_buffer_object *bo,
> + struct ttm_resource *new_mem,
> + struct i915_ttm_kmap_iter *new_iter,
> + struct i915_ttm_kmap_iter *old_iter);
> +#endif
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index ca1b098b6a56..4479c55aaa1d 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -1221,3 +1221,4 @@ void ttm_bo_tt_destroy(struct ttm_buffer_object *bo)
> ttm_tt_destroy(bo->bdev, bo->ttm);
> bo->ttm = NULL;
> }
> +EXPORT_SYMBOL(ttm_bo_tt_destroy);
--
Jani Nikula, Intel Open Source Graphics Center
More information about the dri-devel
mailing list