[Intel-gfx] [PATCH 01/24] Revert "drm/i915/gem: Async GPU relocations only"
Daniel Vetter
daniel at ffwll.ch
Tue Aug 11 12:11:40 UTC 2020
On Mon, Aug 10, 2020 at 12:30:40PM +0200, Maarten Lankhorst wrote:
> This reverts commit 9e0f9464e2ab36b864359a59b0e9058fdef0ce47,
Missed one, you need to dim cite the above so that the commit subject is
included. Also, the sha1 is too long.
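
For reference, the dim cite output would look roughly like this
(12-character sha1 plus the subject in quotes):

	9e0f9464e2ab ("drm/i915/gem: Async GPU relocations only")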
-Daniel
> and related commit 7ac2d2536dfa7 ("drm/i915/gem: Delete unused code").
>
> Async GPU relocations are not the path forward. We want to remove
> GPU-accelerated relocation support eventually, once userspace is fixed
> to use VM_BIND, and this is the first step towards that. We will keep
> async GPU relocations around for now, until userspace is fixed.
>
> Relocation support will be disabled completely on platforms where there
> was never any userspace that depends on it, as the hardware doesn't
> require it from gen9 onward. For older platforms, the plan
> is to use CPU relocations only.
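
(For reference, a CPU relocation on this path boils down to roughly the
following; the sketch is illustrative only, with a made-up helper name,
the real code being relocate_entry() -> clflush_write32() further down in
the diff.)

	/*
	 * Illustrative sketch, not the driver code: a CPU relocation just
	 * writes the target's GPU address into the batch page at the
	 * relocation offset, split into two 32-bit writes when 64-bit
	 * relocations are in use.
	 */
	#include <stdint.h>

	void cpu_reloc_write(uint32_t *batch, uint64_t offset_bytes,
			     uint64_t target_addr, int use_64bit_reloc)
	{
		/* offset must be u32-aligned, as the GEM_BUG_ON below enforces */
		uint32_t *p = batch + offset_bytes / sizeof(uint32_t);

		p[0] = (uint32_t)target_addr;	/* lower 32 bits */
		if (use_64bit_reloc)		/* upper 32 bits */
			p[1] = (uint32_t)(target_addr >> 32);
	}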
>
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
> ---
> .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 319 ++++++++++++++++--
> .../i915/gem/selftests/i915_gem_execbuffer.c | 21 +-
> 2 files changed, 313 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 24a1486d2dc5..c6a613d92a13 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -46,6 +46,13 @@ struct eb_vma_array {
> struct eb_vma vma[];
> };
>
> +enum {
> + FORCE_CPU_RELOC = 1,
> + FORCE_GTT_RELOC,
> + FORCE_GPU_RELOC,
> +#define DBG_FORCE_RELOC 0 /* choose one of the above! */
> +};
> +
> #define __EXEC_OBJECT_HAS_PIN BIT(31)
> #define __EXEC_OBJECT_HAS_FENCE BIT(30)
> #define __EXEC_OBJECT_NEEDS_MAP BIT(29)
> @@ -261,6 +268,8 @@ struct i915_execbuffer {
> */
> struct reloc_cache {
> struct drm_mm_node node; /** temporary GTT binding */
> + unsigned long vaddr; /** Current kmap address */
> + unsigned long page; /** Currently mapped page index */
> unsigned int gen; /** Cached value of INTEL_GEN */
> bool use_64bit_reloc : 1;
> bool has_llc : 1;
> @@ -607,6 +616,23 @@ eb_add_vma(struct i915_execbuffer *eb,
> }
> }
>
> +static inline int use_cpu_reloc(const struct reloc_cache *cache,
> + const struct drm_i915_gem_object *obj)
> +{
> + if (!i915_gem_object_has_struct_page(obj))
> + return false;
> +
> + if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
> + return true;
> +
> + if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
> + return false;
> +
> + return (cache->has_llc ||
> + obj->cache_dirty ||
> + obj->cache_level != I915_CACHE_NONE);
> +}
> +
> static int eb_reserve_vma(const struct i915_execbuffer *eb,
> struct eb_vma *ev,
> u64 pin_flags)
> @@ -937,6 +963,8 @@ relocation_target(const struct drm_i915_gem_relocation_entry *reloc,
> static void reloc_cache_init(struct reloc_cache *cache,
> struct drm_i915_private *i915)
> {
> + cache->page = -1;
> + cache->vaddr = 0;
> /* Must be a variable in the struct to allow GCC to unroll. */
> cache->gen = INTEL_GEN(i915);
> cache->has_llc = HAS_LLC(i915);
> @@ -948,6 +976,25 @@ static void reloc_cache_init(struct reloc_cache *cache,
> cache->target = NULL;
> }
>
> +static inline void *unmask_page(unsigned long p)
> +{
> + return (void *)(uintptr_t)(p & PAGE_MASK);
> +}
> +
> +static inline unsigned int unmask_flags(unsigned long p)
> +{
> + return p & ~PAGE_MASK;
> +}
> +
> +#define KMAP 0x4 /* after CLFLUSH_FLAGS */
> +
> +static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
> +{
> + struct drm_i915_private *i915 =
> + container_of(cache, struct i915_execbuffer, reloc_cache)->i915;
> + return &i915->ggtt;
> +}
> +
> #define RELOC_TAIL 4
>
> static int reloc_gpu_chain(struct reloc_cache *cache)
> @@ -1060,6 +1107,186 @@ static int reloc_gpu_flush(struct reloc_cache *cache)
> return err;
> }
>
> +static void reloc_cache_reset(struct reloc_cache *cache)
> +{
> + void *vaddr;
> +
> + if (!cache->vaddr)
> + return;
> +
> + vaddr = unmask_page(cache->vaddr);
> + if (cache->vaddr & KMAP) {
> + if (cache->vaddr & CLFLUSH_AFTER)
> + mb();
> +
> + kunmap_atomic(vaddr);
> + i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm);
> + } else {
> + struct i915_ggtt *ggtt = cache_to_ggtt(cache);
> +
> + intel_gt_flush_ggtt_writes(ggtt->vm.gt);
> + io_mapping_unmap_atomic((void __iomem *)vaddr);
> +
> + if (drm_mm_node_allocated(&cache->node)) {
> + ggtt->vm.clear_range(&ggtt->vm,
> + cache->node.start,
> + cache->node.size);
> + mutex_lock(&ggtt->vm.mutex);
> + drm_mm_remove_node(&cache->node);
> + mutex_unlock(&ggtt->vm.mutex);
> + } else {
> + i915_vma_unpin((struct i915_vma *)cache->node.mm);
> + }
> + }
> +
> + cache->vaddr = 0;
> + cache->page = -1;
> +}
> +
> +static void *reloc_kmap(struct drm_i915_gem_object *obj,
> + struct reloc_cache *cache,
> + unsigned long pageno)
> +{
> + void *vaddr;
> + struct page *page;
> +
> + if (cache->vaddr) {
> + kunmap_atomic(unmask_page(cache->vaddr));
> + } else {
> + unsigned int flushes;
> + int err;
> +
> + err = i915_gem_object_prepare_write(obj, &flushes);
> + if (err)
> + return ERR_PTR(err);
> +
> + BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS);
> + BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK);
> +
> + cache->vaddr = flushes | KMAP;
> + cache->node.mm = (void *)obj;
> + if (flushes)
> + mb();
> + }
> +
> + page = i915_gem_object_get_page(obj, pageno);
> + if (!obj->mm.dirty)
> + set_page_dirty(page);
> +
> + vaddr = kmap_atomic(page);
> + cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr;
> + cache->page = pageno;
> +
> + return vaddr;
> +}
> +
> +static void *reloc_iomap(struct drm_i915_gem_object *obj,
> + struct reloc_cache *cache,
> + unsigned long page)
> +{
> + struct i915_ggtt *ggtt = cache_to_ggtt(cache);
> + unsigned long offset;
> + void *vaddr;
> +
> + if (cache->vaddr) {
> + intel_gt_flush_ggtt_writes(ggtt->vm.gt);
> + io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr));
> + } else {
> + struct i915_vma *vma;
> + int err;
> +
> + if (i915_gem_object_is_tiled(obj))
> + return ERR_PTR(-EINVAL);
> +
> + if (use_cpu_reloc(cache, obj))
> + return NULL;
> +
> + i915_gem_object_lock(obj);
> + err = i915_gem_object_set_to_gtt_domain(obj, true);
> + i915_gem_object_unlock(obj);
> + if (err)
> + return ERR_PTR(err);
> +
> + vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
> + PIN_MAPPABLE |
> + PIN_NONBLOCK /* NOWARN */ |
> + PIN_NOEVICT);
> + if (IS_ERR(vma)) {
> + memset(&cache->node, 0, sizeof(cache->node));
> + mutex_lock(&ggtt->vm.mutex);
> + err = drm_mm_insert_node_in_range
> + (&ggtt->vm.mm, &cache->node,
> + PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE,
> + 0, ggtt->mappable_end,
> + DRM_MM_INSERT_LOW);
> + mutex_unlock(&ggtt->vm.mutex);
> + if (err) /* no inactive aperture space, use cpu reloc */
> + return NULL;
> + } else {
> + cache->node.start = vma->node.start;
> + cache->node.mm = (void *)vma;
> + }
> + }
> +
> + offset = cache->node.start;
> + if (drm_mm_node_allocated(&cache->node)) {
> + ggtt->vm.insert_page(&ggtt->vm,
> + i915_gem_object_get_dma_address(obj, page),
> + offset, I915_CACHE_NONE, 0);
> + } else {
> + offset += page << PAGE_SHIFT;
> + }
> +
> + vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap,
> + offset);
> + cache->page = page;
> + cache->vaddr = (unsigned long)vaddr;
> +
> + return vaddr;
> +}
> +
> +static void *reloc_vaddr(struct drm_i915_gem_object *obj,
> + struct reloc_cache *cache,
> + unsigned long page)
> +{
> + void *vaddr;
> +
> + if (cache->page == page) {
> + vaddr = unmask_page(cache->vaddr);
> + } else {
> + vaddr = NULL;
> + if ((cache->vaddr & KMAP) == 0)
> + vaddr = reloc_iomap(obj, cache, page);
> + if (!vaddr)
> + vaddr = reloc_kmap(obj, cache, page);
> + }
> +
> + return vaddr;
> +}
> +
> +static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
> +{
> + if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
> + if (flushes & CLFLUSH_BEFORE) {
> + clflushopt(addr);
> + mb();
> + }
> +
> + *addr = value;
> +
> + /*
> + * Writes to the same cacheline are serialised by the CPU
> + * (including clflush). On the write path, we only require
> + * that it hits memory in an orderly fashion and place
> + * mb barriers at the start and end of the relocation phase
> + * to ensure ordering of clflush wrt to the system.
> + */
> + if (flushes & CLFLUSH_AFTER)
> + clflushopt(addr);
> + } else
> + *addr = value;
> +}
> +
> static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
> {
> struct drm_i915_gem_object *obj = vma->obj;
> @@ -1225,6 +1452,17 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
> return cmd;
> }
>
> +static inline bool use_reloc_gpu(struct i915_vma *vma)
> +{
> + if (DBG_FORCE_RELOC == FORCE_GPU_RELOC)
> + return true;
> +
> + if (DBG_FORCE_RELOC)
> + return false;
> +
> + return !dma_resv_test_signaled_rcu(vma->resv, true);
> +}
> +
> static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
> {
> struct page *page;
> @@ -1239,10 +1477,10 @@ static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
> return addr + offset_in_page(offset);
> }
>
> -static int __reloc_entry_gpu(struct i915_execbuffer *eb,
> - struct i915_vma *vma,
> - u64 offset,
> - u64 target_addr)
> +static bool __reloc_entry_gpu(struct i915_execbuffer *eb,
> + struct i915_vma *vma,
> + u64 offset,
> + u64 target_addr)
> {
> const unsigned int gen = eb->reloc_cache.gen;
> unsigned int len;
> @@ -1258,7 +1496,7 @@ static int __reloc_entry_gpu(struct i915_execbuffer *eb,
>
> batch = reloc_gpu(eb, vma, len);
> if (IS_ERR(batch))
> - return PTR_ERR(batch);
> + return false;
>
> addr = gen8_canonical_addr(vma->node.start + offset);
> if (gen >= 8) {
> @@ -1307,21 +1545,55 @@ static int __reloc_entry_gpu(struct i915_execbuffer *eb,
> *batch++ = target_addr;
> }
>
> - return 0;
> + return true;
> +}
> +
> +static bool reloc_entry_gpu(struct i915_execbuffer *eb,
> + struct i915_vma *vma,
> + u64 offset,
> + u64 target_addr)
> +{
> + if (eb->reloc_cache.vaddr)
> + return false;
> +
> + if (!use_reloc_gpu(vma))
> + return false;
> +
> + return __reloc_entry_gpu(eb, vma, offset, target_addr);
> }
>
> static u64
> -relocate_entry(struct i915_execbuffer *eb,
> - struct i915_vma *vma,
> +relocate_entry(struct i915_vma *vma,
> const struct drm_i915_gem_relocation_entry *reloc,
> + struct i915_execbuffer *eb,
> const struct i915_vma *target)
> {
> u64 target_addr = relocation_target(reloc, target);
> - int err;
> -
> - err = __reloc_entry_gpu(eb, vma, reloc->offset, target_addr);
> - if (err)
> - return err;
> + u64 offset = reloc->offset;
> +
> + if (!reloc_entry_gpu(eb, vma, offset, target_addr)) {
> + bool wide = eb->reloc_cache.use_64bit_reloc;
> + void *vaddr;
> +
> +repeat:
> + vaddr = reloc_vaddr(vma->obj,
> + &eb->reloc_cache,
> + offset >> PAGE_SHIFT);
> + if (IS_ERR(vaddr))
> + return PTR_ERR(vaddr);
> +
> + GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32)));
> + clflush_write32(vaddr + offset_in_page(offset),
> + lower_32_bits(target_addr),
> + eb->reloc_cache.vaddr);
> +
> + if (wide) {
> + offset += sizeof(u32);
> + target_addr >>= 32;
> + wide = false;
> + goto repeat;
> + }
> + }
>
> return target->node.start | UPDATE;
> }
> @@ -1386,7 +1658,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
> * If the relocation already has the right value in it, no
> * more work needs to be done.
> */
> - if (gen8_canonical_addr(target->vma->node.start) == reloc->presumed_offset)
> + if (!DBG_FORCE_RELOC &&
> + gen8_canonical_addr(target->vma->node.start) == reloc->presumed_offset)
> return 0;
>
> /* Check that the relocation address is valid... */
> @@ -1418,7 +1691,7 @@ eb_relocate_entry(struct i915_execbuffer *eb,
> ev->flags &= ~EXEC_OBJECT_ASYNC;
>
> /* and update the user's relocation entry */
> - return relocate_entry(eb, ev->vma, reloc, target->vma);
> + return relocate_entry(ev->vma, reloc, eb, target->vma);
> }
>
> static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
> @@ -1456,8 +1729,10 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
> * this is bad and so lockdep complains vehemently.
> */
> copied = __copy_from_user(r, urelocs, count * sizeof(r[0]));
> - if (unlikely(copied))
> - return -EFAULT;
> + if (unlikely(copied)) {
> + remain = -EFAULT;
> + goto out;
> + }
>
> remain -= count;
> do {
> @@ -1465,7 +1740,8 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
>
> if (likely(offset == 0)) {
> } else if ((s64)offset < 0) {
> - return (int)offset;
> + remain = (int)offset;
> + goto out;
> } else {
> /*
> * Note that reporting an error now
> @@ -1495,8 +1771,9 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
> } while (r++, --count);
> urelocs += ARRAY_SIZE(stack);
> } while (remain);
> -
> - return 0;
> +out:
> + reloc_cache_reset(&eb->reloc_cache);
> + return remain;
> }
>
> static int eb_relocate(struct i915_execbuffer *eb)
> @@ -2573,7 +2850,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
> eb.i915 = i915;
> eb.file = file;
> eb.args = args;
> - if (!(args->flags & I915_EXEC_NO_RELOC))
> + if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
> args->flags |= __EXEC_HAS_RELOC;
>
> eb.exec = exec;
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> index 57c14d3340cd..a49016f8ee0d 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> @@ -37,14 +37,20 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
> return err;
>
> /* 8-Byte aligned */
> - err = __reloc_entry_gpu(eb, vma, offsets[0] * sizeof(u32), 0);
> - if (err)
> + if (!__reloc_entry_gpu(eb, vma,
> + offsets[0] * sizeof(u32),
> + 0)) {
> + err = -EIO;
> goto unpin_vma;
> + }
>
> /* !8-Byte aligned */
> - err = __reloc_entry_gpu(eb, vma, offsets[1] * sizeof(u32), 1);
> - if (err)
> + if (!__reloc_entry_gpu(eb, vma,
> + offsets[1] * sizeof(u32),
> + 1)) {
> + err = -EIO;
> goto unpin_vma;
> + }
>
> /* Skip to the end of the cmd page */
> i = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1;
> @@ -54,9 +60,12 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
> eb->reloc_cache.rq_size += i;
>
> /* Force batch chaining */
> - err = __reloc_entry_gpu(eb, vma, offsets[2] * sizeof(u32), 2);
> - if (err)
> + if (!__reloc_entry_gpu(eb, vma,
> + offsets[2] * sizeof(u32),
> + 2)) {
> + err = -EIO;
> goto unpin_vma;
> + }
>
> GEM_BUG_ON(!eb->reloc_cache.rq);
> rq = i915_request_get(eb->reloc_cache.rq);
> --
> 2.28.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch