[Intel-gfx] [PATCH 8/8] drm/i915: Refactor execbuffer relocation writing
Joonas Lahtinen
joonas.lahtinen at linux.intel.com
Thu Jun 9 13:31:25 UTC 2016
On Thu, 2016-06-09 at 12:29 +0100, Chris Wilson wrote:
> With the introduction of the reloc page cache, we are just one step
> away from refactoring the relocation write functions into one. Not only
> does it tidy the code (slightly), but it greatly simplifies the control
> logic much to gcc's satisfaction.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 289 +++++++++++++++--------------
> 1 file changed, 150 insertions(+), 139 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 318c71b663f4..bda8ec8b02e6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -34,6 +34,8 @@
> #include
> #include
>
> +#define DBG_USE_CPU_RELOC 0 /* force relocations to use the CPU write path */
Better to connect this to the new debug Kconfig menu you introduced?
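Something along these lines would do it, assuming the debug menu grows a
dedicated option (CONFIG_DRM_I915_DEBUG_RELOC is just a name I made up):

  /* hypothetical Kconfig hook instead of the hardcoded 0 */
  #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_RELOC)
  #define DBG_USE_CPU_RELOC 1 /* force relocations onto the CPU write path */
  #else
  #define DBG_USE_CPU_RELOC 0
  #endif

or simply #define DBG_USE_CPU_RELOC IS_ENABLED(CONFIG_DRM_I915_DEBUG_RELOC),
which keeps both branches visible to the compiler.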
> +
> #define __EXEC_OBJECT_HAS_PIN (1U<<31)
> #define __EXEC_OBJECT_HAS_FENCE (1U<<30)
> #define __EXEC_OBJECT_NEEDS_MAP (1U<<29)
> @@ -53,6 +55,7 @@ struct i915_execbuffer_params {
> };
>
> struct eb_vmas {
> + struct drm_i915_private *i915;
> struct list_head vmas;
> int and;
> union {
> @@ -62,7 +65,8 @@ struct eb_vmas {
> };
>
> static struct eb_vmas *
> -eb_create(struct drm_i915_gem_execbuffer2 *args)
> +eb_create(struct drm_i915_private *i915,
> + struct drm_i915_gem_execbuffer2 *args)
> {
> struct eb_vmas *eb = NULL;
>
> @@ -89,6 +93,7 @@ eb_create(struct drm_i915_gem_execbuffer2 *args)
> } else
> eb->and = -args->buffer_count;
>
> + eb->i915 = i915;
> INIT_LIST_HEAD(&eb->vmas);
> return eb;
> }
> @@ -272,7 +277,8 @@ static void eb_destroy(struct eb_vmas *eb)
>
> static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
> {
> - return (HAS_LLC(obj->base.dev) ||
> + return (DBG_USE_CPU_RELOC ||
> + HAS_LLC(obj->base.dev) ||
> obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> obj->cache_level != I915_CACHE_NONE);
> }
> @@ -296,32 +302,58 @@ static inline uint64_t gen8_noncanonical_addr(uint64_t address)
> }
>
> static inline uint64_t
> -relocation_target(struct drm_i915_gem_relocation_entry *reloc,
> +relocation_target(const struct drm_i915_gem_relocation_entry *reloc,
These const changes could go in as one bunch instead of being sprinkled
around. Unless we want to smuggle them in through the resistance.
> uint64_t target_offset)
> {
> return gen8_canonical_addr((int)reloc->delta + target_offset);
> }
>
> struct reloc_cache {
> - void *vaddr;
> + struct drm_i915_private *i915;
> + unsigned long vaddr;
> unsigned page;
> - enum { KMAP, IOMAP } type;
> + struct drm_mm_node node;
> + bool use_64bit_reloc;
> };
>
> -static void reloc_cache_init(struct reloc_cache *cache)
> +static void reloc_cache_init(struct reloc_cache *cache,
> + struct drm_i915_private *i915)
> {
> cache->page = -1;
> - cache->vaddr = NULL;
> + cache->vaddr = 0;
> + cache->i915 = i915;
> + cache->use_64bit_reloc = INTEL_GEN(cache->i915) >= 8;
> +}
> +
> +static inline void *unmask_page(unsigned long p)
> +{
> + return (void *)(uintptr_t)(p & PAGE_MASK);
> }
>
> +static inline unsigned unmask_flags(unsigned long p)
> +{
> + return p & ~PAGE_MASK;
> +}
> +
> +#define KMAP 0x4
> +
> static void reloc_cache_fini(struct reloc_cache *cache)
> {
> - if (cache->vaddr == NULL)
> + void *vaddr;
> +
> + if (cache->vaddr == 0)
> return;
>
> - switch (cache->type) {
> - case KMAP: kunmap_atomic(cache->vaddr); break;
> - case IOMAP: io_mapping_unmap_atomic(cache->vaddr); break;
> + vaddr = unmask_page(cache->vaddr);
> + if (cache->vaddr & KMAP) {
> + if (cache->vaddr & CLFLUSH_AFTER)
> + mb();
> +
> + kunmap_atomic(vaddr);
> + i915_gem_object_unpin_pages((struct drm_i915_gem_object *)cache->node.mm);
Rather long line.
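A local variable would keep it within 80 columns, e.g. (untested):

  struct drm_i915_gem_object *obj =
          (struct drm_i915_gem_object *)cache->node.mm;

  kunmap_atomic(vaddr);
  i915_gem_object_unpin_pages(obj);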
> + } else {
> + io_mapping_unmap_atomic(vaddr);
> + i915_vma_unpin((struct i915_vma *)cache->node.mm);
> }
> }
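The unsigned long vaddr took me a moment: the mapping is page aligned, so
the low bits are reused as flag storage (KMAP here, plus the
CLFLUSH_BEFORE/AFTER bits from the prepare_shmem_write patch). A
self-contained userspace sketch of the same packing, purely to spell the
trick out (all names mine):

  #include <assert.h>
  #include <stdio.h>
  #include <stdlib.h>

  #define PAGE_SIZE 4096UL
  #define PAGE_MASK (~(PAGE_SIZE - 1))
  #define FLAG_KMAP 0x4 /* stand-in for KMAP / CLFLUSH_* */

  static void *unmask_page(unsigned long p) { return (void *)(p & PAGE_MASK); }
  static unsigned int unmask_flags(unsigned long p) { return p & ~PAGE_MASK; }

  int main(void)
  {
          /* page-aligned allocation, so the low 12 bits are guaranteed zero */
          void *page = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
          unsigned long cached = (unsigned long)page | FLAG_KMAP;

          assert(unmask_page(cached) == page);
          printf("page %p flags %#x\n", unmask_page(cached), unmask_flags(cached));

          free(page);
          return 0;
  }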
>
> @@ -329,148 +361,138 @@ static void *reloc_kmap(struct drm_i915_gem_object *obj,
> struct reloc_cache *cache,
> int page)
> {
> - if (cache->page == page)
> - return cache->vaddr;
> + void *vaddr;
>
> - if (cache->vaddr)
> - kunmap_atomic(cache->vaddr);
> + if (cache->vaddr) {
> + kunmap_atomic(unmask_page(cache->vaddr));
> + } else {
> + unsigned flushes;
> + int ret;
> +
> + ret = i915_gem_obj_prepare_shmem_write(obj, &flushes);
Yeah, needs_clflush is such a bad name earlier in the series that even you
don't use it here. "need_clflushes" or anything along those lines would be better.
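I.e. just the rename, keeping the prototype implied by the call site above:

  int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
                                       unsigned int *need_clflushes);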
> + if (ret)
> + return ERR_PTR(ret);
>
> + cache->vaddr = flushes | KMAP;
> + cache->node.mm = (void *)obj;
> + if (flushes)
> + mb();
> + }
> +
> + vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page));
> + cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr;
> cache->page = page;
> - cache->vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page));
> - cache->type = KMAP;
>
> - return cache->vaddr;
> + return vaddr;
> }
>
> -static int
> -relocate_entry_cpu(struct drm_i915_gem_object *obj,
> - struct drm_i915_gem_relocation_entry *reloc,
> - struct reloc_cache *cache,
> - uint64_t target_offset)
> +static void *reloc_iomap(struct drm_i915_gem_object *obj,
> + struct reloc_cache *cache,
> + int page)
> {
> - struct drm_device *dev = obj->base.dev;
> - uint32_t page_offset = offset_in_page(reloc->offset);
> - uint64_t delta = relocation_target(reloc, target_offset);
> - char *vaddr;
> - int ret;
> + void *vaddr;
>
> - ret = i915_gem_object_set_to_cpu_domain(obj, true);
> - if (ret)
> - return ret;
> + if (cache->vaddr) {
> + io_mapping_unmap_atomic(unmask_page(cache->vaddr));
> + } else {
> + struct i915_vma *vma;
> + int ret;
>
> - vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT);
> - *(uint32_t *)(vaddr + page_offset) = lower_32_bits(delta);
> + if (use_cpu_reloc(obj))
> + return NULL;
>
> - if (INTEL_GEN(dev) >= 8) {
> - page_offset += sizeof(uint32_t);
> - if (page_offset == PAGE_SIZE) {
> - vaddr = reloc_kmap(obj, cache, cache->page + 1);
> - page_offset = 0;
> - }
> - *(uint32_t *)(vaddr + page_offset) = upper_32_bits(delta);
> - }
> + vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
> + PIN_MAPPABLE | PIN_NONBLOCK);
> + if (IS_ERR(vma))
> + return NULL;
>
> - return 0;
> -}
Ugh, did you simultaneously move and rename the functions? That makes the
diff rather hard to follow from this point on...
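For anyone else reading along, as far as I can follow it the three
relocate_entry_{cpu,gtt,clflush}() paths collapse into a single
relocate_entry() that runs the write step twice for gen8+ 64-bit
relocations, roughly:

  /* rough outline of the new write path, not the literal code */
  relocate_entry(obj, reloc, cache, target):
          offset = reloc->offset;
          wide = cache->use_64bit_reloc;
  repeat:
          vaddr = reloc_vaddr(obj, cache, offset >> PAGE_SHIFT);
                  /* cached page? reuse it
                   * else try reloc_iomap() (GGTT, unless use_cpu_reloc())
                   * else fall back to reloc_kmap() (shmem + clflush flags) */
          clflush_write32(vaddr + offset_in_page(offset),
                          lower_32_bits(target), cache->vaddr);
          if (wide) {
                  offset += sizeof(uint32_t);
                  target >>= 32;
                  wide = false;
                  goto repeat;
          }

With that mental model the end result does read simpler, even if the diff
doesn't.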
Regards, Joonas
> + ret = i915_gem_object_set_to_gtt_domain(obj, true);
> + if (ret)
> + return ERR_PTR(ret);
>
> -static void *reloc_iomap(struct drm_i915_private *i915,
> - struct reloc_cache *cache,
> - uint64_t offset)
> -{
> - if (cache->page == offset >> PAGE_SHIFT)
> - return cache->vaddr;
> + ret = i915_gem_object_put_fence(obj);
> + if (ret)
> + return ERR_PTR(ret);
>
> - if (cache->vaddr)
> - io_mapping_unmap_atomic(cache->vaddr);
> + cache->node.start = vma->node.start;
> + cache->node.mm = (void *)vma;
> + }
>
> - cache->page = offset >> PAGE_SHIFT;
> - cache->vaddr =
> - io_mapping_map_atomic_wc(i915->ggtt.mappable,
> - offset & PAGE_MASK);
> - cache->type = IOMAP;
> + vaddr = io_mapping_map_atomic_wc(cache->i915->ggtt.mappable,
> + cache->node.start + (page << PAGE_SHIFT));
> + cache->page = page;
> + cache->vaddr = (unsigned long)vaddr;
>
> - return cache->vaddr;
> + return vaddr;
> }
>
> -static int
> -relocate_entry_gtt(struct drm_i915_gem_object *obj,
> - struct drm_i915_gem_relocation_entry *reloc,
> - struct reloc_cache *cache,
> - uint64_t target_offset)
> +static void *reloc_vaddr(struct drm_i915_gem_object *obj,
> + struct reloc_cache *cache,
> + int page)
> {
> - struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
> - struct i915_vma *vma;
> - uint64_t delta = relocation_target(reloc, target_offset);
> - uint64_t offset;
> - void __iomem *reloc_page;
> - int ret;
> -
> - vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE);
> - if (IS_ERR(vma))
> - return PTR_ERR(vma);
> -
> - ret = i915_gem_object_set_to_gtt_domain(obj, true);
> - if (ret)
> - goto unpin;
> -
> - ret = i915_gem_object_put_fence(obj);
> - if (ret)
> - goto unpin;
> -
> - /* Map the page containing the relocation we're going to perform. */
> - offset = vma->node.start;
> - offset += reloc->offset;
> - reloc_page = reloc_iomap(dev_priv, cache, offset);
> - iowrite32(lower_32_bits(delta), reloc_page + offset_in_page(offset));
> + void *vaddr;
>
> - if (INTEL_GEN(dev_priv) >= 8) {
> - offset += sizeof(uint32_t);
> - if (offset_in_page(offset) == 0)
> - reloc_page = reloc_iomap(dev_priv, cache, offset);
> - iowrite32(upper_32_bits(delta),
> - reloc_page + offset_in_page(offset));
> + if (cache->page == page) {
> + vaddr = unmask_page(cache->vaddr);
> + } else {
> + vaddr = NULL;
> + if ((cache->vaddr & KMAP) == 0)
> + vaddr = reloc_iomap(obj, cache, page);
> + if (vaddr == NULL)
> + vaddr = reloc_kmap(obj, cache, page);
> }
>
> -unpin:
> - __i915_vma_unpin(vma);
> - return ret;
> + return vaddr;
> }
>
> static void
> -clflush_write32(void *addr, uint32_t value)
> +clflush_write32(uint32_t *addr, uint32_t value, unsigned flushes)
> {
> - /* This is not a fast path, so KISS. */
> - drm_clflush_virt_range(addr, sizeof(uint32_t));
> - *(uint32_t *)addr = value;
> - drm_clflush_virt_range(addr, sizeof(uint32_t));
> + if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
> + if (flushes & CLFLUSH_BEFORE) {
> + clflushopt(addr);
> + mb();
> + }
> +
> + *addr = value;
> +
> + /* Writes to the same cacheline are serialised by the CPU
> + * (including clflush). On the write path, we only require
> + * that it hits memory in an orderly fashion and place
> + * mb barriers at the start and end of the relocation phase
> + * to ensure ordering of clflush wrt to the system.
> + */
> + if (flushes & CLFLUSH_AFTER)
> + clflushopt(addr);
> + } else
> + *addr = value;
> }
>
> static int
> -relocate_entry_clflush(struct drm_i915_gem_object *obj,
> - struct drm_i915_gem_relocation_entry *reloc,
> - struct reloc_cache *cache,
> - uint64_t target_offset)
> +relocate_entry(struct drm_i915_gem_object *obj,
> + const struct drm_i915_gem_relocation_entry *reloc,
> + struct reloc_cache *cache,
> + uint64_t target_offset)
> {
> - struct drm_device *dev = obj->base.dev;
> - uint32_t page_offset = offset_in_page(reloc->offset);
> - uint64_t delta = relocation_target(reloc, target_offset);
> - char *vaddr;
> - int ret;
> + uint64_t offset = reloc->offset;
> + bool wide = cache->use_64bit_reloc;
> + void *vaddr;
>
> - ret = i915_gem_object_set_to_gtt_domain(obj, true);
> - if (ret)
> - return ret;
> + target_offset = relocation_target(reloc, target_offset);
> +repeat:
> + vaddr = reloc_vaddr(obj, cache, offset >> PAGE_SHIFT);
> + if (IS_ERR(vaddr))
> + return PTR_ERR(vaddr);
>
> - vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT);
> - clflush_write32(vaddr + page_offset, lower_32_bits(delta));
> + clflush_write32(vaddr + offset_in_page(offset),
> + lower_32_bits(target_offset),
> + cache->vaddr);
>
> - if (INTEL_GEN(dev) >= 8) {
> - page_offset += sizeof(uint32_t);
> - if (page_offset == PAGE_SIZE) {
> - vaddr = reloc_kmap(obj, cache, cache->page + 1);
> - page_offset = 0;
> - }
> - clflush_write32(vaddr + page_offset, upper_32_bits(delta));
> + if (wide) {
> + offset += sizeof(uint32_t);
> + target_offset >>= 32;
> + wide = false;
> + goto repeat;
> }
>
> return 0;
> @@ -557,7 +579,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>
> /* Check that the relocation address is valid... */
> if (unlikely(reloc->offset >
> - obj->base.size - (INTEL_INFO(dev)->gen >= 8 ? 8 : 4))) {
> + obj->base.size - (cache->use_64bit_reloc ? 8 : 4))) {
> DRM_DEBUG("Relocation beyond object bounds: "
> "obj %p target %d offset %d size %d.\n",
> obj, reloc->target_handle,
> @@ -577,23 +599,12 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> if (pagefault_disabled() && !object_is_idle(obj))
> return -EFAULT;
>
> - if (use_cpu_reloc(obj))
> - ret = relocate_entry_cpu(obj, reloc, cache, target_offset);
> - else if (obj->map_and_fenceable)
> - ret = relocate_entry_gtt(obj, reloc, cache, target_offset);
> - else if (static_cpu_has(X86_FEATURE_CLFLUSH))
> - ret = relocate_entry_clflush(obj, reloc, cache, target_offset);
> - else {
> - WARN_ONCE(1, "Impossible case in relocation handling\n");
> - ret = -ENODEV;
> - }
> -
> + ret = relocate_entry(obj, reloc, cache, target_offset);
> if (ret)
> return ret;
>
> /* and update the user's relocation entry */
> reloc->presumed_offset = target_offset;
> -
> return 0;
> }
>
> @@ -609,7 +620,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
> int remain, ret = 0;
>
> user_relocs = u64_to_user_ptr(entry->relocs_ptr);
> - reloc_cache_init(&cache);
> + reloc_cache_init(&cache, eb->i915);
>
> remain = entry->relocation_count;
> while (remain) {
> @@ -658,7 +669,7 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
> struct reloc_cache cache;
> int i, ret = 0;
>
> - reloc_cache_init(&cache);
> + reloc_cache_init(&cache, eb->i915);
> for (i = 0; i < entry->relocation_count; i++) {
> ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache);
> if (ret)
> @@ -1073,8 +1084,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
> if (flush_chipset)
> i915_gem_chipset_flush(req->engine->i915);
>
> - if (flush_domains & I915_GEM_DOMAIN_GTT)
> - wmb();
> + /* Make sure (untracked) CPU relocs/parsing are flushed */
> + wmb();
>
> /* Unconditionally invalidate gpu caches and TLBs. */
> return req->engine->emit_flush(req, I915_GEM_GPU_DOMAINS, 0);
> @@ -1606,7 +1617,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>
> memset(&params_master, 0x00, sizeof(params_master));
>
> - eb = eb_create(args);
> + eb = eb_create(dev_priv, args);
> if (eb == NULL) {
> i915_gem_context_put(ctx);
> mutex_unlock(&dev->struct_mutex);
--
Joonas Lahtinen
Open Source Technology Center
Intel Corporation