[Intel-gfx] [PATCH 9/9] drm/i915/cmdparser: Accelerate copies from WC memory
Matthew Auld
matthew.william.auld at gmail.com
Wed Aug 17 16:33:35 UTC 2016
On 12 August 2016 at 16:07, Chris Wilson <chris at chris-wilson.co.uk> wrote:
> If we need to use clflush to prepare our batch for reads from memory, we
> can bypass the cache instead by using non-temporal copies.
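For anyone reading along: i915_memcpy_from_wc(), added earlier in this
series, is built on SSE4.1 movntdqa streaming loads, which pull aligned
16-byte chunks out of WC memory without being stalled by (or polluting)
the cache hierarchy. A minimal sketch of the idea, assuming the real
__memcpy_ntdqa() in i915_memcpy.c keeps its current shape (it unrolls
to 64 bytes per iteration, but the core is roughly this):

static void __memcpy_from_wc_sketch(void *dst, const void *src,
				    unsigned long len)
{
	/* The SSE register state may belong to userspace, so save
	 * and restore it around our use of xmm0.
	 */
	kernel_fpu_begin();

	/* movntdqa demands 16-byte aligned addresses and whole
	 * 16-byte chunks, hence the ALIGN(batch_len, 16) at the
	 * call site below.
	 */
	for (len >>= 4; len; len--, dst += 16, src += 16)
		asm("movntdqa (%0), %%xmm0\n"
		    "movaps %%xmm0, (%1)\n"
		    :: "r" (src), "r" (dst) : "memory");

	kernel_fpu_end();
}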
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_cmd_parser.c | 58 ++++++++++++++++++++++------------
> drivers/gpu/drm/i915/i915_debugfs.c | 24 --------------
> drivers/gpu/drm/i915/i915_drv.c | 19 -----------
> drivers/gpu/drm/i915/i915_gem.c | 48 ++++++++++++++++------------
> drivers/gpu/drm/i915/i915_gem_gtt.c | 17 +++++++---
> drivers/gpu/drm/i915/i915_gem_tiling.c | 4 ---
> drivers/gpu/drm/i915/i915_irq.c | 2 --
> drivers/gpu/drm/i915/intel_uncore.c | 6 ++--
> 8 files changed, 81 insertions(+), 97 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index cea3ef7299cc..3244ef1401ad 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -969,8 +969,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
> {
> unsigned int src_needs_clflush;
> unsigned int dst_needs_clflush;
> - void *dst, *ptr;
> - int offset, n;
> + void *dst;
> int ret;
>
> ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
> @@ -987,24 +986,43 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
> if (IS_ERR(dst))
> goto unpin_dst;
>
> - ptr = dst;
> - offset = offset_in_page(batch_start_offset);
> - if (dst_needs_clflush & CLFLUSH_BEFORE)
> - batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);
> -
> - for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
> - int len = min_t(int, batch_len, PAGE_SIZE - offset);
> - void *vaddr;
> -
> - vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
> - if (src_needs_clflush)
> - drm_clflush_virt_range(vaddr + offset, len);
> - memcpy(ptr, vaddr + offset, len);
> - kunmap_atomic(vaddr);
> -
> - ptr += len;
> - batch_len -= len;
> - offset = 0;
> + if (src_needs_clflush &&
> + i915_memcpy_from_wc((void *)(uintptr_t)batch_start_offset, 0, 0)) {
> + void *src;
> +
> + src = i915_gem_object_pin_map(src_obj, I915_MAP_WC);
> + if (IS_ERR(src))
> + goto shmem_copy;
> +
> + i915_memcpy_from_wc(dst,
> + src + batch_start_offset,
> + ALIGN(batch_len, 16));
> + i915_gem_object_unpin_map(src_obj);
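The (void *)(uintptr_t)batch_start_offset, 0, 0 call above deserves a
note for future readers: with a zero length, i915_memcpy_from_wc()
copies nothing and serves purely as a predicate -- "would the WC fast
path work here?". If memory serves, the helper from earlier in the
series is roughly:

bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	/* movntdqa needs 16-byte alignment; with len == 0 this
	 * reduces to an alignment check on the batch offset that
	 * was smuggled in via the dst pointer.
	 */
	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
		return false;

	if (static_branch_likely(&has_movntdqa)) {
		if (likely(len))
			__memcpy_ntdqa(dst, src, len >> 4);
		return true;
	}

	return false;
}

so the WC branch is taken only when SSE4.1 is available and
batch_start_offset is 16-byte aligned.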
> + } else {
> + void *ptr;
> + int offset, n;
> +
> +shmem_copy:
I think Joonas may shed another tear at the sight of this :)
> + offset = offset_in_page(batch_start_offset);
> + if (dst_needs_clflush & CLFLUSH_BEFORE)
> + batch_len = roundup(batch_len,
> + boot_cpu_data.x86_clflush_size);
> +
> + ptr = dst;
> + for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
> + int len = min_t(int, batch_len, PAGE_SIZE - offset);
> + void *vaddr;
> +
> + vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
> + if (src_needs_clflush)
> + drm_clflush_virt_range(vaddr + offset, len);
> + memcpy(ptr, vaddr + offset, len);
> + kunmap_atomic(vaddr);
> +
> + ptr += len;
> + batch_len -= len;
> + offset = 0;
> + }
> }
>
Disregarding the rest, which seems unrelated to this patch.
Reviewed-by: Matthew Auld <matthew.auld at intel.com>
> /* dst_obj is returned with vmap pinned */
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 2fe88d930ca7..8dcdc27afe80 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -715,18 +715,13 @@ static int i915_gem_seqno_info(struct seq_file *m, void *data)
> struct drm_device *dev = node->minor->dev;
> struct drm_i915_private *dev_priv = to_i915(dev);
> struct intel_engine_cs *engine;
> - int ret;
>
> - ret = mutex_lock_interruptible(&dev->struct_mutex);
> - if (ret)
> - return ret;
> intel_runtime_pm_get(dev_priv);
>
> for_each_engine(engine, dev_priv)
> i915_ring_seqno_info(m, engine);
>
> intel_runtime_pm_put(dev_priv);
> - mutex_unlock(&dev->struct_mutex);
>
> return 0;
> }
> @@ -1379,11 +1374,7 @@ static int ironlake_drpc_info(struct seq_file *m)
> struct drm_i915_private *dev_priv = to_i915(dev);
> u32 rgvmodectl, rstdbyctl;
> u16 crstandvid;
> - int ret;
>
> - ret = mutex_lock_interruptible(&dev->struct_mutex);
> - if (ret)
> - return ret;
> intel_runtime_pm_get(dev_priv);
>
> rgvmodectl = I915_READ(MEMMODECTL);
> @@ -1391,7 +1382,6 @@ static int ironlake_drpc_info(struct seq_file *m)
> crstandvid = I915_READ16(CRSTANDVID);
>
> intel_runtime_pm_put(dev_priv);
> - mutex_unlock(&dev->struct_mutex);
>
> seq_printf(m, "HD boost: %s\n", yesno(rgvmodectl & MEMMODE_BOOST_EN));
> seq_printf(m, "Boost freq: %d\n",
> @@ -2179,11 +2169,7 @@ static int i915_swizzle_info(struct seq_file *m, void *data)
> struct drm_info_node *node = m->private;
> struct drm_device *dev = node->minor->dev;
> struct drm_i915_private *dev_priv = to_i915(dev);
> - int ret;
>
> - ret = mutex_lock_interruptible(&dev->struct_mutex);
> - if (ret)
> - return ret;
> intel_runtime_pm_get(dev_priv);
>
> seq_printf(m, "bit6 swizzle for X-tiling = %s\n",
> @@ -2223,7 +2209,6 @@ static int i915_swizzle_info(struct seq_file *m, void *data)
> seq_puts(m, "L-shaped memory detected\n");
>
> intel_runtime_pm_put(dev_priv);
> - mutex_unlock(&dev->struct_mutex);
>
> return 0;
> }
> @@ -4729,13 +4714,9 @@ i915_wedged_set(void *data, u64 val)
> if (i915_reset_in_progress(&dev_priv->gpu_error))
> return -EAGAIN;
>
> - intel_runtime_pm_get(dev_priv);
> -
> i915_handle_error(dev_priv, val,
> "Manually setting wedged to %llu", val);
>
> - intel_runtime_pm_put(dev_priv);
> -
> return 0;
> }
>
> @@ -4976,20 +4957,15 @@ i915_cache_sharing_get(void *data, u64 *val)
> struct drm_device *dev = data;
> struct drm_i915_private *dev_priv = to_i915(dev);
> u32 snpcr;
> - int ret;
>
> if (!(IS_GEN6(dev) || IS_GEN7(dev)))
> return -ENODEV;
>
> - ret = mutex_lock_interruptible(&dev->struct_mutex);
> - if (ret)
> - return ret;
> intel_runtime_pm_get(dev_priv);
>
> snpcr = I915_READ(GEN6_MBCUNIT_SNPCR);
>
> intel_runtime_pm_put(dev_priv);
> - mutex_unlock(&dev_priv->drm.struct_mutex);
>
> *val = (snpcr & GEN6_MBC_SNPCR_MASK) >> GEN6_MBC_SNPCR_SHIFT;
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index c040c6329804..b458faa0d349 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -2293,24 +2293,6 @@ static int intel_runtime_suspend(struct device *device)
>
> DRM_DEBUG_KMS("Suspending device\n");
>
> - /*
> - * We could deadlock here in case another thread holding struct_mutex
> - * calls RPM suspend concurrently, since the RPM suspend will wait
> - * first for this RPM suspend to finish. In this case the concurrent
> - * RPM resume will be followed by its RPM suspend counterpart. Still
> - * for consistency return -EAGAIN, which will reschedule this suspend.
> - */
> - if (!mutex_trylock(&dev->struct_mutex)) {
> - DRM_DEBUG_KMS("device lock contention, deffering suspend\n");
> - /*
> - * Bump the expiration timestamp, otherwise the suspend won't
> - * be rescheduled.
> - */
> - pm_runtime_mark_last_busy(device);
> -
> - return -EAGAIN;
> - }
> -
> disable_rpm_wakeref_asserts(dev_priv);
>
> /*
> @@ -2318,7 +2300,6 @@ static int intel_runtime_suspend(struct device *device)
> * an RPM reference.
> */
> i915_gem_release_all_mmaps(dev_priv);
> - mutex_unlock(&dev->struct_mutex);
>
> intel_guc_suspend(dev);
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 5c1acfc10bc4..a26bfd7d6aab 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1434,11 +1434,9 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
> if (ret)
> goto err;
>
> - intel_runtime_pm_get(dev_priv);
> -
> ret = i915_mutex_lock_interruptible(dev);
> if (ret)
> - goto err_rpm;
> + goto err;
>
> ret = -EFAULT;
> /* We can only do the GTT pwrite on untiled buffers, as otherwise
> @@ -1449,7 +1447,9 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
> */
> if (!i915_gem_object_has_struct_page(obj) ||
> cpu_write_needs_clflush(obj)) {
> + intel_runtime_pm_get(dev_priv);
> ret = i915_gem_gtt_pwrite_fast(dev_priv, obj, args, file);
> + intel_runtime_pm_put(dev_priv);
> /* Note that the gtt paths might fail with non-page-backed user
> * pointers (e.g. gtt mappings when moving data between
> * textures). Fallback to the shmem path in that case. */
> @@ -1464,12 +1464,8 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>
> i915_gem_object_put(obj);
> mutex_unlock(&dev->struct_mutex);
> - intel_runtime_pm_put(dev_priv);
> -
> return ret;
>
> -err_rpm:
> - intel_runtime_pm_put(dev_priv);
> err:
> i915_gem_object_put_unlocked(obj);
> return ret;
> @@ -1833,9 +1829,13 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
> /* Serialisation between user GTT access and our code depends upon
> * revoking the CPU's PTE whilst the mutex is held. The next user
> * pagefault then has to wait until we release the mutex.
> + *
> +        *
> +        * Note that RPM complicates matters somewhat by adding an
> +        * additional requirement that operations on the GGTT be made
> +        * while holding the RPM wakeref. This in turn allows us to
> +        * release the mmap from within the RPM suspend code, relying on
> +        * the RPM barriers in lieu of the struct_mutex serialisation.
> */
> - lockdep_assert_held(&obj->base.dev->struct_mutex);
> -
> if (!obj->fault_mappable)
> return;
>
> @@ -1854,11 +1854,21 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
> obj->fault_mappable = false;
> }
>
> +static void assert_rpm_release_all_mmaps(struct drm_i915_private *dev_priv)
> +{
> + assert_rpm_wakelock_held(dev_priv);
> +}
> +
> void
> i915_gem_release_all_mmaps(struct drm_i915_private *dev_priv)
> {
> struct drm_i915_gem_object *obj;
>
> + /* This should only be called by RPM as we require the bound_list
> + * to be protected by the RPM barriers and not struct_mutex.
> + * We check that we are holding the wakeref whenever we manipulate
> + * the dev_priv->mm.bound_list (via assert_rpm_release_all_mmaps).
> + */
> list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list)
> i915_gem_release_mmap(obj);
> }
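For reference, the new assert is cheap -- an atomic_read() plus a
WARN_ONCE() -- so sprinkling it over the bound_list sites costs next to
nothing. From memory, assert_rpm_wakelock_held() in intel_drv.h is
roughly:

static inline void
assert_rpm_wakelock_held(struct drm_i915_private *dev_priv)
{
	/* Touching the bound_list or a GGTT mapping without
	 * struct_mutex is only safe while someone holds a wakeref,
	 * since the RPM suspend path is what would otherwise race
	 * with us.
	 */
	assert_rpm_device_not_suspended(dev_priv);
	WARN_ONCE(!atomic_read(&dev_priv->pm.wakeref_count),
		  "RPM wakelock ref not held during HW access");
}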
> @@ -2402,9 +2412,11 @@ i915_gem_object_retire__read(struct i915_gem_active *active,
> * so that we don't steal from recently used but inactive objects
> * (unless we are forced to ofc!)
> */
> - if (obj->bind_count)
> + if (obj->bind_count) {
> + assert_rpm_release_all_mmaps(request->i915);
> list_move_tail(&obj->global_list,
> &request->i915->mm.bound_list);
> + }
>
> if (i915_gem_object_has_active_reference(obj)) {
> i915_gem_object_clear_active_reference(obj);
> @@ -2881,9 +2893,11 @@ int i915_vma_unbind(struct i915_vma *vma)
>
> /* Since the unbound list is global, only move to that list if
> * no more VMAs exist. */
> - if (--obj->bind_count == 0)
> + if (--obj->bind_count == 0) {
> + assert_rpm_release_all_mmaps(to_i915(obj->base.dev));
> list_move_tail(&obj->global_list,
> &to_i915(obj->base.dev)->mm.unbound_list);
> + }
>
> /* And finally now the object is completely decoupled from this vma,
> * we can drop its hold on the backing storage and allow it to be
> @@ -3071,6 +3085,7 @@ search_free:
> }
> GEM_BUG_ON(!i915_gem_valid_gtt_space(vma, obj->cache_level));
>
> + assert_rpm_release_all_mmaps(dev_priv);
> list_move_tail(&obj->global_list, &dev_priv->mm.bound_list);
> list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
> obj->bind_count++;
> @@ -3420,7 +3435,6 @@ int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
> int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file)
> {
> - struct drm_i915_private *dev_priv = to_i915(dev);
> struct drm_i915_gem_caching *args = data;
> struct drm_i915_gem_object *obj;
> enum i915_cache_level level;
> @@ -3449,11 +3463,9 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
> return -EINVAL;
> }
>
> - intel_runtime_pm_get(dev_priv);
> -
> ret = i915_mutex_lock_interruptible(dev);
> if (ret)
> - goto rpm_put;
> + return ret;
>
> obj = i915_gem_object_lookup(file, args->handle);
> if (!obj) {
> @@ -3462,13 +3474,9 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
> }
>
> ret = i915_gem_object_set_cache_level(obj, level);
> -
> i915_gem_object_put(obj);
> unlock:
> mutex_unlock(&dev->struct_mutex);
> -rpm_put:
> - intel_runtime_pm_put(dev_priv);
> -
> return ret;
> }
>
> @@ -4174,8 +4182,6 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj)
>
> kfree(obj->bit_17);
> i915_gem_object_free(obj);
> -
> - intel_runtime_pm_put(dev_priv);
> }
>
> void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index fe7f9887ee67..67a3ff960b0d 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2594,6 +2594,7 @@ static int ggtt_bind_vma(struct i915_vma *vma,
> enum i915_cache_level cache_level,
> u32 flags)
> {
> + struct drm_i915_private *i915 = to_i915(vma->vm->dev);
> struct drm_i915_gem_object *obj = vma->obj;
> u32 pte_flags = 0;
> int ret;
> @@ -2606,8 +2607,10 @@ static int ggtt_bind_vma(struct i915_vma *vma,
> if (obj->gt_ro)
> pte_flags |= PTE_READ_ONLY;
>
> + intel_runtime_pm_get(i915);
> vma->vm->insert_entries(vma->vm, vma->pages, vma->node.start,
> cache_level, pte_flags);
> +        intel_runtime_pm_put(i915);
>
> /*
> * Without aliasing PPGTT there's no difference between
> @@ -2623,6 +2626,7 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma,
> enum i915_cache_level cache_level,
> u32 flags)
> {
> + struct drm_i915_private *i915 = to_i915(vma->vm->dev);
> u32 pte_flags;
> int ret;
>
> @@ -2637,14 +2641,15 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma,
>
>
> if (flags & I915_VMA_GLOBAL_BIND) {
> + intel_runtime_pm_get(i915);
> vma->vm->insert_entries(vma->vm,
> vma->pages, vma->node.start,
> cache_level, pte_flags);
> + intel_runtime_pm_put(i915);
> }
>
> if (flags & I915_VMA_LOCAL_BIND) {
> - struct i915_hw_ppgtt *appgtt =
> - to_i915(vma->vm->dev)->mm.aliasing_ppgtt;
> + struct i915_hw_ppgtt *appgtt = i915->mm.aliasing_ppgtt;
> appgtt->base.insert_entries(&appgtt->base,
> vma->pages, vma->node.start,
> cache_level, pte_flags);
> @@ -2655,13 +2660,17 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma,
>
> static void ggtt_unbind_vma(struct i915_vma *vma)
> {
> - struct i915_hw_ppgtt *appgtt = to_i915(vma->vm->dev)->mm.aliasing_ppgtt;
> + struct drm_i915_private *i915 = to_i915(vma->vm->dev);
> + struct i915_hw_ppgtt *appgtt = i915->mm.aliasing_ppgtt;
> const u64 size = min(vma->size, vma->node.size);
>
> - if (vma->flags & I915_VMA_GLOBAL_BIND)
> + if (vma->flags & I915_VMA_GLOBAL_BIND) {
> + intel_runtime_pm_get(i915);
> vma->vm->clear_range(vma->vm,
> vma->node.start, size,
> true);
> + intel_runtime_pm_put(i915);
> + }
>
> if (vma->flags & I915_VMA_LOCAL_BIND && appgtt)
> appgtt->base.clear_range(&appgtt->base,
> diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c
> index a14b1e3d4c78..08f796a4f5f6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_tiling.c
> +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c
> @@ -204,8 +204,6 @@ i915_gem_set_tiling(struct drm_device *dev, void *data,
> return -EINVAL;
> }
>
> - intel_runtime_pm_get(dev_priv);
> -
> mutex_lock(&dev->struct_mutex);
> if (obj->pin_display || obj->framebuffer_references) {
> err = -EBUSY;
> @@ -301,8 +299,6 @@ err:
> i915_gem_object_put(obj);
> mutex_unlock(&dev->struct_mutex);
>
> - intel_runtime_pm_put(dev_priv);
> -
> return err;
> }
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index ebb83d5a448b..3d9c2a21dfbd 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2523,7 +2523,6 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
> * simulated reset via debugs, so get an RPM reference.
> */
> intel_runtime_pm_get(dev_priv);
> -
> intel_prepare_reset(dev_priv);
>
> /*
> @@ -2535,7 +2534,6 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
> ret = i915_reset(dev_priv);
>
> intel_finish_reset(dev_priv);
> -
> intel_runtime_pm_put(dev_priv);
>
> if (ret == 0)
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 43f833901b8e..a6b04da4bf21 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1414,7 +1414,7 @@ int i915_reg_read_ioctl(struct drm_device *dev,
> struct register_whitelist const *entry = whitelist;
> unsigned size;
> i915_reg_t offset_ldw, offset_udw;
> - int i, ret = 0;
> + int i, ret;
>
> for (i = 0; i < ARRAY_SIZE(whitelist); i++, entry++) {
> if (i915_mmio_reg_offset(entry->offset_ldw) == (reg->offset & -entry->size) &&
> @@ -1436,6 +1436,7 @@ int i915_reg_read_ioctl(struct drm_device *dev,
>
> intel_runtime_pm_get(dev_priv);
>
> + ret = 0;
> switch (size) {
> case 8 | 1:
> reg->val = I915_READ64_2x32(offset_ldw, offset_udw);
> @@ -1454,10 +1455,9 @@ int i915_reg_read_ioctl(struct drm_device *dev,
> break;
> default:
> ret = -EINVAL;
> - goto out;
> + break;
> }
>
> -out:
> intel_runtime_pm_put(dev_priv);
> return ret;
> }
> --
> 2.8.1
>