[Intel-gfx] [PATCH 2/2] drm/i915: Perform object clflushing asynchronously

Fri Feb 17 14:07:15 UTC 2017

Flushing the cachelines for an object is slow, can be as much as 100ms
for a large framebuffer. We currently do this under the struct_mutex BKL
on execution or on pageflip. But now with the ability to add fences to
obj->resv for both flips and execbuf (and we naturally wait on the fence
before CPU access), we can move the clflush operation to a workqueue and
signal a fence for completion, thereby doing the work asynchronously and
not blocking the driver or its clients.

Suggested-by: Akash Goel <akash.goel at intel.com>
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: Matthew Auld <matthew.auld at intel.com>
---
 drivers/gpu/drm/i915/Makefile              |   1 +
 drivers/gpu/drm/i915/i915_drv.h            |   8 +-
 drivers/gpu/drm/i915/i915_gem.c            |  66 ++--------
 drivers/gpu/drm/i915/i915_gem_clflush.c    | 194 +++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   8 +-
 drivers/gpu/drm/i915/intel_display.c       |  49 ++++----
 6 files changed, 240 insertions(+), 86 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_gem_clflush.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 1490d8622234..b1b580337c7a 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -29,6 +29,7 @@ i915-$(CONFIG_DEBUG_FS) += i915_debugfs.o intel_pipe_crc.o
 # GEM code
 i915-y += i915_cmd_parser.o \
 	  i915_gem_batch_pool.o \
+	  i915_gem_clflush.o \
 	  i915_gem_context.o \
 	  i915_gem_dmabuf.o \
 	  i915_gem_evict.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5005922f267b..97d3ecbcf8d4 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3369,7 +3369,13 @@ int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
 void i915_gem_reset(struct drm_i915_private *dev_priv);
 void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
-void i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
+
+void i915_gem_clflush_init(struct drm_i915_private *i915);
+void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
+			     unsigned int flags);
+#define I915_CLFLUSH_FORCE BIT(0)
+#define I915_CLFLUSH_SYNC BIT(1)
+
 void i915_gem_init_mmio(struct drm_i915_private *i915);
 int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
 int __must_check i915_gem_init_hw(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 96098a7e0bc3..fc11f5205e00 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1621,23 +1621,17 @@ i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
 {
 	struct drm_i915_gem_sw_finish *args = data;
 	struct drm_i915_gem_object *obj;
-	int err = 0;
 
 	obj = i915_gem_object_lookup(file, args->handle);
 	if (!obj)
 		return -ENOENT;
 
 	/* Pinned buffers may be scanout, so flush the cache */
-	if (READ_ONCE(obj->pin_display)) {
-		err = i915_mutex_lock_interruptible(dev);
-		if (!err) {
-			i915_gem_object_flush_cpu_write_domain(obj);
-			mutex_unlock(&dev->struct_mutex);
-		}
-	}
+	if (READ_ONCE(obj->pin_display))
+		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
 
 	i915_gem_object_put(obj);
-	return err;
+	return 0;
 }
 
 /**
@@ -3155,46 +3149,6 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
 	return 0;
 }
 
-void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
-			     bool force)
-{
-	/* If we don't have a page list set up, then we're not pinned
-	 * to GPU, and we can ignore the cache flush because it'll happen
-	 * again at bind time.
-	 */
-	if (!obj->mm.pages) {
-		GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
-		return;
-	}
-
-	/*
-	 * Stolen memory is always coherent with the GPU as it is explicitly
-	 * marked as wc by the system, or the system is cache-coherent.
-	 * Similarly, we only access struct pages through the CPU cache, so
-	 * anything not backed by physical memory we consider to be always
-	 * coherent and not need clflushing.
-	 */
-	if (!i915_gem_object_has_struct_page(obj))
-		return;
-
-	/* If the GPU is snooping the contents of the CPU cache,
-	 * we do not need to manually clear the CPU cache lines.  However,
-	 * the caches are only snooped when the render cache is
-	 * flushed/invalidated.  As we always have to emit invalidations
-	 * and flushes when moving into and out of the RENDER domain, correct
-	 * snooping behaviour occurs naturally as the result of our domain
-	 * tracking.
-	 */
-	if (!force && cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) {
-		obj->cache_dirty = true;
-		return;
-	}
-
-	trace_i915_gem_object_clflush(obj);
-	drm_clflush_sg(obj->mm.pages);
-	obj->cache_dirty = false;
-}
-
 /** Flushes the GTT write domain for the object if it's dirty. */
 static void
 i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj)
@@ -3238,8 +3192,7 @@ i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj)
 	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
 		return;
 
-	i915_gem_clflush_object(obj, obj->pin_display);
-	intel_fb_obj_flush(obj, false, ORIGIN_CPU);
+	i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 
 	obj->base.write_domain = 0;
 	trace_i915_gem_object_change_domain(obj,
@@ -3610,10 +3563,8 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 	vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
 
 	/* Treat this as an end-of-frame, like intel_user_framebuffer_dirty() */
-	if (obj->cache_dirty || obj->base.write_domain == I915_GEM_DOMAIN_CPU) {
-		i915_gem_clflush_object(obj, true);
-		intel_fb_obj_flush(obj, false, ORIGIN_DIRTYFB);
-	}
+	if (obj->cache_dirty || obj->base.write_domain == I915_GEM_DOMAIN_CPU)
+		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
 
 	old_write_domain = obj->base.write_domain;
 	old_read_domains = obj->base.read_domains;
@@ -3687,8 +3638,7 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 
 	/* Flush the CPU cache if it's still invalid. */
 	if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) {
-		i915_gem_clflush_object(obj, false);
-
+		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 		obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
 	}
 
@@ -4560,6 +4510,8 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 
 	mutex_lock(&dev_priv->drm.struct_mutex);
 
+	i915_gem_clflush_init(dev_priv);
+
 	if (!i915.enable_execlists) {
 		dev_priv->gt.resume = intel_legacy_submission_resume;
 		dev_priv->gt.cleanup_engine = intel_engine_cleanup;
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
new file mode 100644
index 000000000000..1a8fc58aa925
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "i915_drv.h"
+#include "intel_frontbuffer.h"
+
+static DEFINE_SPINLOCK(clflush_lock);
+static u64 clflush_context;
+
+struct clflushf {
+	struct dma_fence dma;
+	struct i915_sw_fence wait;
+	struct work_struct work;
+	struct drm_i915_gem_object *obj;
+};
+
+static const char *i915_clflush_get_driver_name(struct dma_fence *fence)
+{
+	return "i915";
+}
+
+static const char *i915_clflush_get_timeline_name(struct dma_fence *fence)
+{
+	return "clflush";
+}
+
+static bool i915_clflush_enable_signaling(struct dma_fence *fence)
+{
+	return true;
+}
+
+static void i915_clflush_release(struct dma_fence *fence)
+{
+	struct clflushf *f = container_of(fence, typeof(*f), dma);
+
+	i915_sw_fence_fini(&f->wait);
+	dma_fence_free(&f->dma);
+}
+
+static const struct dma_fence_ops i915_clflush_ops = {
+	.get_driver_name = i915_clflush_get_driver_name,
+	.get_timeline_name = i915_clflush_get_timeline_name,
+	.enable_signaling = i915_clflush_enable_signaling,
+	.wait = dma_fence_default_wait,
+	.release = i915_clflush_release,
+};
+
+static bool cpu_cache_is_coherent(struct drm_device *dev,
+				  enum i915_cache_level level)
+{
+	return HAS_LLC(to_i915(dev)) || level != I915_CACHE_NONE;
+}
+
+static void __i915_do_clflush(struct drm_i915_gem_object *obj)
+{
+	drm_clflush_sg(obj->mm.pages);
+	obj->cache_dirty = false;
+
+	intel_fb_obj_flush(obj, false, ORIGIN_CPU);
+}
+
+static void i915_clflush_work(struct work_struct *work)
+{
+	struct clflushf *f = container_of(work, typeof(*f), work);
+	struct drm_i915_gem_object *obj = f->obj;
+
+	if (i915_gem_object_pin_pages(obj)) {
+		obj->cache_dirty = true;
+		goto out;
+	}
+
+	__i915_do_clflush(obj);
+
+	i915_gem_object_unpin_pages(obj);
+
+out:
+	i915_gem_object_put(obj);
+
+	dma_fence_signal(&f->dma);
+	dma_fence_put(&f->dma);
+}
+
+static int __i915_sw_fence_call
+i915_clflush_notify(struct i915_sw_fence *fence,
+		    enum i915_sw_fence_notify state)
+{
+	struct clflushf *f = container_of(fence, typeof(*f), wait);
+
+	switch (state) {
+	case FENCE_COMPLETE:
+		schedule_work(&f->work);
+		break;
+
+	case FENCE_FREE:
+		dma_fence_put(&f->dma);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
+			     unsigned int flags)
+{
+	struct clflushf *f;
+
+	/*
+	 * Stolen memory is always coherent with the GPU as it is explicitly
+	 * marked as wc by the system, or the system is cache-coherent.
+	 * Similarly, we only access struct pages through the CPU cache, so
+	 * anything not backed by physical memory we consider to be always
+	 * coherent and not need clflushing.
+	 */
+	if (!i915_gem_object_has_struct_page(obj))
+		return;
+
+	/* If the GPU is snooping the contents of the CPU cache,
+	 * we do not need to manually clear the CPU cache lines.  However,
+	 * the caches are only snooped when the render cache is
+	 * flushed/invalidated.  As we always have to emit invalidations
+	 * and flushes when moving into and out of the RENDER domain, correct
+	 * snooping behaviour occurs naturally as the result of our domain
+	 * tracking.
+	 */
+	if (!(flags & I915_CLFLUSH_FORCE) &&
+	    cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) {
+		obj->cache_dirty = true;
+		return;
+	}
+
+	trace_i915_gem_object_clflush(obj);
+
+	f = NULL;
+	if (!(flags & I915_CLFLUSH_SYNC))
+		f = kmalloc(sizeof(*f), GFP_KERNEL);
+	if (f) {
+		dma_fence_init(&f->dma,
+			       &i915_clflush_ops,
+			       &clflush_lock,
+			       clflush_context,
+			       0);
+		i915_sw_fence_init(&f->wait, i915_clflush_notify);
+
+		f->obj = i915_gem_object_get(obj);
+		INIT_WORK(&f->work, i915_clflush_work);
+
+		/* +1 for scheduled work, +1 for fence */
+		dma_fence_get(&f->dma);
+		dma_fence_get(&f->dma);
+
+		if (i915_sw_fence_await_reservation(&f->wait,
+						    obj->resv, NULL,
+						    false, I915_FENCE_TIMEOUT,
+						    GFP_KERNEL) <= 0)
+			schedule_work(&f->work);
+		i915_sw_fence_commit(&f->wait);
+
+		reservation_object_lock(obj->resv, NULL);
+		reservation_object_add_excl_fence(obj->resv, &f->dma);
+		reservation_object_unlock(obj->resv);
+
+		dma_fence_put(&f->dma);
+	} else if (obj->mm.pages) {
+		__i915_do_clflush(obj);
+	} else {
+		obj->cache_dirty = true;
+	}
+}
+
+void i915_gem_clflush_init(struct drm_i915_private *i915)
+{
+	clflush_context = dma_fence_context_alloc(1);
+}
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index da0846fe2ad6..341bbeb010da 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1114,13 +1114,15 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
 			continue;
 
+		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
+			i915_gem_clflush_object(obj, 0);
+			obj->base.write_domain = 0;
+		}
+
 		ret = i915_gem_request_await_object
 			(req, obj, obj->base.pending_write_domain);
 		if (ret)
 			return ret;
-
-		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
-			i915_gem_clflush_object(obj, false);
 	}
 
 	/* Unconditionally flush any chipset caches (for streaming writes). */
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index b05d9c85384b..010375099104 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -13188,6 +13188,29 @@ intel_prepare_plane_fb(struct drm_plane *plane,
 	struct drm_i915_gem_object *old_obj = intel_fb_obj(plane->state->fb);
 	int ret;
 
+	if (obj) {
+		if (plane->type == DRM_PLANE_TYPE_CURSOR &&
+		    INTEL_INFO(dev_priv)->cursor_needs_physical) {
+			const int align = IS_I830(dev_priv) ? 16 * 1024 : 256;
+
+			ret = i915_gem_object_attach_phys(obj, align);
+			if (ret) {
+				DRM_DEBUG_KMS("failed to attach phys object\n");
+				return ret;
+			}
+		} else {
+			struct i915_vma *vma;
+
+			vma = intel_pin_and_fence_fb_obj(fb, new_state->rotation);
+			if (IS_ERR(vma)) {
+				DRM_DEBUG_KMS("failed to pin object\n");
+				return PTR_ERR(vma);
+			}
+
+			to_intel_plane_state(new_state)->vma = vma;
+		}
+	}
+
 	if (!obj && !old_obj)
 		return 0;
 
@@ -13240,26 +13263,6 @@ intel_prepare_plane_fb(struct drm_plane *plane,
 		i915_gem_object_wait_priority(obj, 0, I915_PRIORITY_DISPLAY);
 	}
 
-	if (plane->type == DRM_PLANE_TYPE_CURSOR &&
-	    INTEL_INFO(dev_priv)->cursor_needs_physical) {
-		int align = IS_I830(dev_priv) ? 16 * 1024 : 256;
-		ret = i915_gem_object_attach_phys(obj, align);
-		if (ret) {
-			DRM_DEBUG_KMS("failed to attach phys object\n");
-			return ret;
-		}
-	} else {
-		struct i915_vma *vma;
-
-		vma = intel_pin_and_fence_fb_obj(fb, new_state->rotation);
-		if (IS_ERR(vma)) {
-			DRM_DEBUG_KMS("failed to pin object\n");
-			return PTR_ERR(vma);
-		}
-
-		to_intel_plane_state(new_state)->vma = vma;
-	}
-
 	return 0;
 }
 
@@ -14278,15 +14281,11 @@ static int intel_user_framebuffer_dirty(struct drm_framebuffer *fb,
 					struct drm_clip_rect *clips,
 					unsigned num_clips)
 {
-	struct drm_device *dev = fb->dev;
 	struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb);
 	struct drm_i915_gem_object *obj = intel_fb->obj;
 
-	mutex_lock(&dev->struct_mutex);
 	if (obj->pin_display && obj->cache_dirty)
-		i915_gem_clflush_object(obj, true);
-	intel_fb_obj_flush(obj, false, ORIGIN_DIRTYFB);
-	mutex_unlock(&dev->struct_mutex);
+		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
 
 	return 0;
 }
-- 
2.11.0