[Intel-gfx] [PATCH 2/2] drm/i915: Perform object clflushing asynchronously

Matthew Auld matthew.william.auld at gmail.com
Mon Feb 20 18:41:04 UTC 2017


On 17 February 2017 at 14:07, Chris Wilson <chris at chris-wilson.co.uk> wrote:
> Flushing the cachelines for an object is slow, can be as much as 100ms
> for a large framebuffer. We currently do this under the struct_mutex BKL
> on execution or on pageflip. But now with the ability to add fences to
> obj->resv for both flips and execbuf (and we naturally wait on the fence
> before CPU access), we can move the clflush operation to a workqueue and
> signal a fence for completion, thereby doing the work asynchronously and
> not blocking the driver or its clients.
>
> Suggested-by: Akash Goel <akash.goel at intel.com>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Matthew Auld <matthew.auld at intel.com>
> ---
>  drivers/gpu/drm/i915/Makefile              |   1 +
>  drivers/gpu/drm/i915/i915_drv.h            |   8 +-
>  drivers/gpu/drm/i915/i915_gem.c            |  66 ++--------
>  drivers/gpu/drm/i915/i915_gem_clflush.c    | 194 +++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   8 +-
>  drivers/gpu/drm/i915/intel_display.c       |  49 ++++----
>  6 files changed, 240 insertions(+), 86 deletions(-)
>  create mode 100644 drivers/gpu/drm/i915/i915_gem_clflush.c
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 1490d8622234..b1b580337c7a 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -29,6 +29,7 @@ i915-$(CONFIG_DEBUG_FS) += i915_debugfs.o intel_pipe_crc.o
>  # GEM code
>  i915-y += i915_cmd_parser.o \
>           i915_gem_batch_pool.o \
> +         i915_gem_clflush.o \
>           i915_gem_context.o \
>           i915_gem_dmabuf.o \
>           i915_gem_evict.o \
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5005922f267b..97d3ecbcf8d4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3369,7 +3369,13 @@ int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
>  void i915_gem_reset(struct drm_i915_private *dev_priv);
>  void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
>  void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
> -void i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
> +
> +void i915_gem_clflush_init(struct drm_i915_private *i915);
> +void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
> +                            unsigned int flags);
> +#define I915_CLFLUSH_FORCE BIT(0)
> +#define I915_CLFLUSH_SYNC BIT(1)
> +
>  void i915_gem_init_mmio(struct drm_i915_private *i915);
>  int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
>  int __must_check i915_gem_init_hw(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 96098a7e0bc3..fc11f5205e00 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1621,23 +1621,17 @@ i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
>  {
>         struct drm_i915_gem_sw_finish *args = data;
>         struct drm_i915_gem_object *obj;
> -       int err = 0;
>
>         obj = i915_gem_object_lookup(file, args->handle);
>         if (!obj)
>                 return -ENOENT;
>
>         /* Pinned buffers may be scanout, so flush the cache */
> -       if (READ_ONCE(obj->pin_display)) {
> -               err = i915_mutex_lock_interruptible(dev);
> -               if (!err) {
> -                       i915_gem_object_flush_cpu_write_domain(obj);
> -                       mutex_unlock(&dev->struct_mutex);
> -               }
> -       }
> +       if (READ_ONCE(obj->pin_display))
> +               i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
>
>         i915_gem_object_put(obj);
> -       return err;
> +       return 0;
>  }
>
>  /**
> @@ -3155,46 +3149,6 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>         return 0;
>  }
>
> -void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
> -                            bool force)
> -{
> -       /* If we don't have a page list set up, then we're not pinned
> -        * to GPU, and we can ignore the cache flush because it'll happen
> -        * again at bind time.
> -        */
> -       if (!obj->mm.pages) {
> -               GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
> -               return;
> -       }
> -
> -       /*
> -        * Stolen memory is always coherent with the GPU as it is explicitly
> -        * marked as wc by the system, or the system is cache-coherent.
> -        * Similarly, we only access struct pages through the CPU cache, so
> -        * anything not backed by physical memory we consider to be always
> -        * coherent and not need clflushing.
> -        */
> -       if (!i915_gem_object_has_struct_page(obj))
> -               return;
> -
> -       /* If the GPU is snooping the contents of the CPU cache,
> -        * we do not need to manually clear the CPU cache lines.  However,
> -        * the caches are only snooped when the render cache is
> -        * flushed/invalidated.  As we always have to emit invalidations
> -        * and flushes when moving into and out of the RENDER domain, correct
> -        * snooping behaviour occurs naturally as the result of our domain
> -        * tracking.
> -        */
> -       if (!force && cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) {
> -               obj->cache_dirty = true;
> -               return;
> -       }
> -
> -       trace_i915_gem_object_clflush(obj);
> -       drm_clflush_sg(obj->mm.pages);
> -       obj->cache_dirty = false;
> -}
> -
>  /** Flushes the GTT write domain for the object if it's dirty. */
>  static void
>  i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj)
> @@ -3238,8 +3192,7 @@ i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj)
>         if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
>                 return;
>
> -       i915_gem_clflush_object(obj, obj->pin_display);
> -       intel_fb_obj_flush(obj, false, ORIGIN_CPU);
> +       i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
>
>         obj->base.write_domain = 0;
>         trace_i915_gem_object_change_domain(obj,
> @@ -3610,10 +3563,8 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
>         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
>
>         /* Treat this as an end-of-frame, like intel_user_framebuffer_dirty() */
> -       if (obj->cache_dirty || obj->base.write_domain == I915_GEM_DOMAIN_CPU) {
> -               i915_gem_clflush_object(obj, true);
> -               intel_fb_obj_flush(obj, false, ORIGIN_DIRTYFB);
> -       }
> +       if (obj->cache_dirty || obj->base.write_domain == I915_GEM_DOMAIN_CPU)
> +               i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
>
>         old_write_domain = obj->base.write_domain;
>         old_read_domains = obj->base.read_domains;
> @@ -3687,8 +3638,7 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
>
>         /* Flush the CPU cache if it's still invalid. */
>         if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) {
> -               i915_gem_clflush_object(obj, false);
> -
> +               i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
>                 obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
>         }
>
> @@ -4560,6 +4510,8 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>
>         mutex_lock(&dev_priv->drm.struct_mutex);
>
> +       i915_gem_clflush_init(dev_priv);
> +
>         if (!i915.enable_execlists) {
>                 dev_priv->gt.resume = intel_legacy_submission_resume;
>                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
> diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
> new file mode 100644
> index 000000000000..1a8fc58aa925
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
> @@ -0,0 +1,194 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include "i915_drv.h"
> +#include "intel_frontbuffer.h"
> +
> +static DEFINE_SPINLOCK(clflush_lock);
> +static u64 clflush_context;
> +
> +struct clflushf {
> +       struct dma_fence dma;
> +       struct i915_sw_fence wait;
> +       struct work_struct work;
> +       struct drm_i915_gem_object *obj;
> +};
> +
> +static const char *i915_clflush_get_driver_name(struct dma_fence *fence)
> +{
> +       return "i915";
> +}
> +
> +static const char *i915_clflush_get_timeline_name(struct dma_fence *fence)
> +{
> +       return "clflush";
> +}
> +
> +static bool i915_clflush_enable_signaling(struct dma_fence *fence)
> +{
> +       return true;
> +}
> +
> +static void i915_clflush_release(struct dma_fence *fence)
> +{
> +       struct clflushf *f = container_of(fence, typeof(*f), dma);
> +
> +       i915_sw_fence_fini(&f->wait);
> +       dma_fence_free(&f->dma);
Don't we need a kfree(f) somewhere?

Also CI didn't look very happy, is this series meant to be standalone,
or is there something else waiting in the wings?


More information about the Intel-gfx mailing list