[Intel-gfx] [PATCH 5/7] drm/i915/ttm, drm/ttm: Add a generic TTM memcpy move for page-based iomem

Tue May 18 06:53:50 UTC 2021

On 5/17/21 12:57 PM, Jani Nikula wrote:
> On Tue, 11 May 2021, Thomas Hellström <thomas.hellstrom at linux.intel.com> wrote:
>> The internal ttm_bo_util memcpy uses vmap functionality, and while it
>> probably might be possible to use it for copying in- and out of
>> sglist represented io memory, using io_mem_reserve() / io_mem_free()
>> callbacks, that would cause problems with fault().
>> Instead, implement a method mapping page-by-page using kmap_local()
>> semantics. As an additional benefit we then avoid the occasional global
>> TLB flushes of vmap() and consuming vmap space, elimination of a critical
>> point of failure and with a slight change of semantics we could also push
>> the memcpy out async for testing and async driver develpment purposes.
>> Pushing out async can be done since there is no memory allocation going on
>> that could violate the dma_fence lockdep rules.
>>
>> Note that drivers that don't want to use struct io_mapping but relies on
>> memremap functionality, and that don't want to use scatterlists for
>> VRAM may well define specialized (hopefully reusable) iterators for their
>> particular environment.
>>
>> Cc: Christian König <christian.koenig at amd.com>
>> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
>> ---
>>   drivers/gpu/drm/i915/Makefile                 |   1 +
>>   .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.c   | 155 ++++++++++++++++++
>>   .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.h   | 141 ++++++++++++++++
>>   drivers/gpu/drm/ttm/ttm_bo.c                  |   1 +
>>   4 files changed, 298 insertions(+)
>>   create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
>>   create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h
>>
>> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
>> index cb8823570996..958ccc1edfed 100644
>> --- a/drivers/gpu/drm/i915/Makefile
>> +++ b/drivers/gpu/drm/i915/Makefile
>> @@ -155,6 +155,7 @@ gem-y += \
>>   	gem/i915_gem_stolen.o \
>>   	gem/i915_gem_throttle.o \
>>   	gem/i915_gem_tiling.o \
>> +	gem/i915_gem_ttm_bo_util.o \
>>   	gem/i915_gem_userptr.o \
>>   	gem/i915_gem_wait.o \
>>   	gem/i915_gemfs.o
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
>> new file mode 100644
>> index 000000000000..1116d7df1461
>> --- /dev/null
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
>> @@ -0,0 +1,155 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2021 Intel Corporation
>> + */
>> +
>> +/**
>> + * DOC: Usage and intentions.
>> + *
>> + * This file contains functionality that we might want to move into
>> + * ttm_bo_util.c if there is a common interest.
>> + * Currently a kmap_local only memcpy with support for page-based iomem regions,
>> + * and fast memcpy from write-combined memory.
>> + */
>> +
>> +#include <linux/dma-buf-map.h>
>> +#include <linux/highmem.h>
>> +#include <linux/io-mapping.h>
>> +#include <linux/scatterlist.h>
>> +
>> +#include "i915_memcpy.h"
>> +
>> +#include "gem/i915_gem_ttm_bo_util.h"
>> +
>> +static void i915_ttm_kmap_iter_tt_kmap_local(struct i915_ttm_kmap_iter *iter,
>> +					     struct dma_buf_map *dmap,
>> +					     pgoff_t i)
>> +{
>> +	struct i915_ttm_kmap_iter_tt *iter_tt =
>> +		container_of(iter, typeof(*iter_tt), base);
>> +
>> +	dma_buf_map_set_vaddr(dmap, kmap_local_page(iter_tt->tt->pages[i]));
>> +}
>> +
>> +static void i915_ttm_kmap_iter_iomap_kmap_local(struct i915_ttm_kmap_iter *iter,
>> +						struct dma_buf_map *dmap,
>> +						pgoff_t i)
>> +{
>> +	struct i915_ttm_kmap_iter_iomap *iter_io =
>> +		container_of(iter, typeof(*iter_io), base);
>> +	void __iomem *addr;
>> +
>> +retry:
>> +	while (i >= iter_io->cache.end) {
>> +		iter_io->cache.sg = iter_io->cache.sg ?
>> +			sg_next(iter_io->cache.sg) : iter_io->st->sgl;
>> +		iter_io->cache.i = iter_io->cache.end;
>> +		iter_io->cache.end += sg_dma_len(iter_io->cache.sg) >>
>> +			PAGE_SHIFT;
>> +		iter_io->cache.offs = sg_dma_address(iter_io->cache.sg) -
>> +			iter_io->start;
>> +	}
>> +
>> +	if (i < iter_io->cache.i) {
>> +		iter_io->cache.end = 0;
>> +		iter_io->cache.sg = NULL;
>> +		goto retry;
>> +	}
>> +
>> +	addr = io_mapping_map_local_wc(iter_io->iomap, iter_io->cache.offs +
>> +				       (((resource_size_t)i - iter_io->cache.i)
>> +					<< PAGE_SHIFT));
>> +	dma_buf_map_set_vaddr_iomem(dmap, addr);
>> +}
>> +
>> +struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_tt_ops = {
>> +	.kmap_local = i915_ttm_kmap_iter_tt_kmap_local
>> +};
>> +
>> +struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_io_ops = {
>> +	.kmap_local =  i915_ttm_kmap_iter_iomap_kmap_local
>> +};
>> +
>> +static void kunmap_local_dma_buf_map(struct dma_buf_map *map)
>> +{
>> +	if (map->is_iomem)
>> +		io_mapping_unmap_local(map->vaddr_iomem);
>> +	else
>> +		kunmap_local(map->vaddr);
>> +}
>> +
>> +/**
>> + * i915_ttm_move_memcpy - Helper to perform a memcpy ttm move operation.
>> + * @bo: The struct ttm_buffer_object.
>> + * @new_mem: The struct ttm_resource we're moving to (copy destination).
>> + * @new_kmap: A struct i915_ttm_kmap_iter representing the destination resource.
>> + * @old_kmap: A struct i915_ttm_kmap_iter representing the source resource.
>> + */
>> +void i915_ttm_move_memcpy(struct ttm_buffer_object *bo,
>> +			  struct ttm_resource *new_mem,
>> +			  struct i915_ttm_kmap_iter *new_kmap,
>> +			  struct i915_ttm_kmap_iter *old_kmap)
>> +{
>> +	struct ttm_device *bdev = bo->bdev;
>> +	struct ttm_resource_manager *man = ttm_manager_type(bdev, new_mem->mem_type);
>> +	struct ttm_tt *ttm = bo->ttm;
>> +	struct ttm_resource *old_mem = &bo->mem;
>> +	struct ttm_resource old_copy = *old_mem;
>> +	struct ttm_resource_manager *old_man = ttm_manager_type(bdev, old_mem->mem_type);
>> +	struct dma_buf_map old_map, new_map;
>> +	pgoff_t i;
>> +
>> +	/* For the page-based allocator we need sgtable iterators as well.*/
>> +
>> +	/* Single TTM move. NOP */
>> +	if (old_man->use_tt && man->use_tt)
>> +		goto done;
>> +
>> +	/* Don't move nonexistent data. Clear destination instead. */
>> +	if (old_man->use_tt && !man->use_tt &&
>> +	    (ttm == NULL || !ttm_tt_is_populated(ttm))) {
>> +		if (ttm && !(ttm->page_flags & TTM_PAGE_FLAG_ZERO_ALLOC))
>> +			goto done;
>> +
>> +		for (i = 0; i < new_mem->num_pages; ++i) {
>> +			new_kmap->ops->kmap_local(new_kmap, &new_map, i);
>> +			memset_io(new_map.vaddr_iomem, 0, PAGE_SIZE);
>> +			kunmap_local_dma_buf_map(&new_map);
>> +		}
>> +		goto done;
>> +	}
>> +
>> +	for (i = 0; i < new_mem->num_pages; ++i) {
>> +		new_kmap->ops->kmap_local(new_kmap, &new_map, i);
>> +		old_kmap->ops->kmap_local(old_kmap, &old_map, i);
>> +		if (!old_map.is_iomem ||
>> +		    !i915_memcpy_from_wc(new_map.vaddr, old_map.vaddr, PAGE_SIZE)) {
>> +			if (!old_map.is_iomem) {
>> +				dma_buf_map_memcpy_to(&new_map, old_map.vaddr,
>> +						      PAGE_SIZE);
>> +			} else if (!new_map.is_iomem) {
>> +				memcpy_fromio(new_map.vaddr, old_map.vaddr_iomem,
>> +					      PAGE_SIZE);
>> +			} else {
>> +				pgoff_t j;
>> +				u32 __iomem *src = old_map.vaddr_iomem;
>> +				u32 __iomem *dst = new_map.vaddr_iomem;
>> +
>> +				for (j = 0; j < (PAGE_SIZE >> 2); ++j)
>> +					iowrite32(ioread32(src++), dst++);
>> +			}
>> +		}
>> +		kunmap_local_dma_buf_map(&old_map);
>> +		kunmap_local_dma_buf_map(&new_map);
>> +	}
>> +
>> +done:
>> +	old_copy = *old_mem;
>> +
>> +	ttm_bo_assign_mem(bo, new_mem);
>> +
>> +	if (!man->use_tt)
>> +		ttm_bo_tt_destroy(bo);
>> +
>> +	ttm_resource_free(bo, &old_copy);
>> +}
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h
>> new file mode 100644
>> index 000000000000..82c92176718d
>> --- /dev/null
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h
>> @@ -0,0 +1,141 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2021 Intel Corporation
>> + */
>> +
>> +/*
>> + * This files contains functionality that we might want to move into
>> + * ttm_bo_util.c if there is a common interest.
>> + */
>> +#ifndef _I915_GEM_TTM_BO_UTIL_H_
>> +#define _I915_GEM_TTM_BO_UTIL_H_
>> +
>> +#include <drm/ttm/ttm_bo_driver.h>
>> +struct dma_buf_map;
>> +struct io_mapping;
>> +struct sg_table;
>> +struct scatterlist;
>> +
>> +struct ttm_tt;
>> +struct i915_ttm_kmap_iter;
>> +
>> +/**
>> + * struct i915_ttm_kmap_iter_ops - Ops structure for a struct
>> + * i915_ttm_kmap_iter.
>> + */
>> +struct i915_ttm_kmap_iter_ops {
>> +	/**
>> +	 * kmap_local - Map a PAGE_SIZE part of the resource using
>> +	 * kmap_local semantics.
>> +	 * @res_kmap: Pointer to the struct i915_ttm_kmap_iter representing
>> +	 * the resource.
>> +	 * @dmap: The struct dma_buf_map holding the virtual address after
>> +	 * the operation.
>> +	 * @i: The location within the resource to map. PAGE_SIZE granularity.
>> +	 */
>> +	void (*kmap_local)(struct i915_ttm_kmap_iter *res_kmap,
>> +			   struct dma_buf_map *dmap, pgoff_t i);
>> +};
>> +
>> +/**
>> + * struct i915_ttm_kmap_iter - Iterator for kmap_local type operations on a
>> + * resource.
>> + * @ops: Pointer to the operations struct.
>> + *
>> + * This struct is intended to be embedded in a resource-specific specialization
>> + * implementing operations for the resource.
>> + *
>> + * Nothing stops us from extending the operations to vmap, vmap_pfn etc,
>> + * replacing some or parts of the ttm_bo_util. cpu-map functionality.
>> + */
>> +struct i915_ttm_kmap_iter {
>> +	const struct i915_ttm_kmap_iter_ops *ops;
>> +};
>> +
>> +/**
>> + * struct i915_ttm_kmap_iter_tt - Specialization for a tt (page) backed struct
>> + * ttm_resource.
>> + * @base: Embedded struct i915_ttm_kmap_iter providing the usage interface
>> + * @tt: Cached struct ttm_tt.
>> + */
>> +struct i915_ttm_kmap_iter_tt {
>> +	struct i915_ttm_kmap_iter base;
>> +	struct ttm_tt *tt;
>> +};
>> +
>> +/**
>> + * struct i915_ttm_kmap_iter_iomap - Specialization for a struct io_mapping +
>> + * struct sg_table backed struct ttm_resource.
>> + * @base: Embedded struct i915_ttm_kmap_iter providing the usage interface.
>> + * @iomap: struct io_mapping representing the underlying linear io_memory.
>> + * @st: sg_table into @iomap, representing the memory of the struct ttm_resource.
>> + * @start: Offset that needs to be subtracted from @st to make
>> + * sg_dma_address(st->sgl) - @start == 0 for @iomap start.
>> + * @cache: Scatterlist traversal cache for fast lookups.
>> + * @cache.sg: Pointer to the currently cached scatterlist segment.
>> + * @cache.i: First index of @sg. PAGE_SIZE granularity.
>> + * @cache.end: Last index + 1 of @sg. PAGE_SIZE granularity.
>> + * @cache.offs: First offset into @iomap of @sg. PAGE_SIZE granularity.
>> + */
>> +struct i915_ttm_kmap_iter_iomap {
>> +	struct i915_ttm_kmap_iter base;
>> +	struct io_mapping *iomap;
>> +	struct sg_table *st;
>> +	resource_size_t start;
>> +	struct {
>> +		struct scatterlist *sg;
>> +		pgoff_t i;
>> +		pgoff_t end;
>> +		pgoff_t offs;
>> +	} cache;
>> +};
>> +
>> +extern struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_tt_ops;
>> +extern struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_io_ops;
>> +
>> +/**
>> + * i915_ttm_kmap_iter_iomap_init - Initialize a struct i915_ttm_kmap_iter_iomap
>> + * @iter_io: The struct i915_ttm_kmap_iter_iomap to initialize.
>> + * @iomap: The struct io_mapping representing the underlying linear io_memory.
>> + * @st: sg_table into @iomap, representing the memory of the struct
>> + * ttm_resource.
>> + * @start: Offset that needs to be subtracted from @st to make
>> + * sg_dma_address(st->sgl) - @start == 0 for @iomap start.
>> + *
>> + * Return: Pointer to the embedded struct i915_ttm_kmap_iter.
>> + */
>> +static inline struct i915_ttm_kmap_iter *
>> +i915_ttm_kmap_iter_iomap_init(struct i915_ttm_kmap_iter_iomap *iter_io,
>> +			      struct io_mapping *iomap,
>> +			      struct sg_table *st,
>> +			      resource_size_t start)
>> +{
>> +	iter_io->base.ops = &i915_ttm_kmap_iter_io_ops;
>> +	iter_io->iomap = iomap;
>> +	iter_io->st = st;
>> +	iter_io->start = start;
>> +	memset(&iter_io->cache, 0, sizeof(iter_io->cache));
>> +	return &iter_io->base;
>> +}
>> +
>> +/**
>> + * ttm_kmap_iter_tt_init - Initialize a struct i915_ttm_kmap_iter_tt
>> + * @iter_tt: The struct i915_ttm_kmap_iter_tt to initialize.
>> + * @tt: Struct ttm_tt holding page pointers of the struct ttm_resource.
>> + *
>> + * Return: Pointer to the embedded struct i915_ttm_kmap_iter.
>> + */
>> +static inline struct i915_ttm_kmap_iter *
>> +i915_ttm_kmap_iter_tt_init(struct i915_ttm_kmap_iter_tt *iter_tt,
>> +			   struct ttm_tt *tt)
>> +{
>> +	iter_tt->base.ops = &i915_ttm_kmap_iter_tt_ops;
>> +	iter_tt->tt = tt;
>> +	return &iter_tt->base;
>> +}
> Do there functions have a valid *performance* reason to be inline? I
> think that's pretty much the only valid reason.
>
> Having these inline forces i915_ttm_kmap_iter_*_ops extern, and they
> should really be static. Inline functions complicate header dependencies
> and leak the abstractions.
>
> BR,
> Jani.

Hi,

Thanks for reviewing. I don't think there really is a performance reason 
for keeping these functions inline. While in this case there is not 
really much change either in leaking abstractions nor in header 
dependencies I agree keeping those ops static is probably a better 
choice. I'll respin.

Thanks,

Thomas