[Intel-gfx] [PATCH] drm/i915: Perform object clflushing asynchronously
Chris Wilson
chris at chris-wilson.co.uk
Fri Nov 4 20:03:57 UTC 2016
Flushing the cachelines for an object is slow; it can take as much as
100ms for a large framebuffer. We currently do this under the
struct_mutex BKL on execution or on pageflip. But now that we can add
fences to obj->resv for both flips and execbuf (and we naturally wait on
those fences before CPU access), we can move the clflush operation to a
workqueue and signal a fence on completion, doing the work asynchronously
without blocking the driver or its clients.
Suggested-by: Akash Goel <akash.goel at intel.com>
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Akash Goel <akash.goel at intel.com>
---
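[Reader's note, not part of the commit.] The new i915_gem_clflush.c below
boils down to a work-item-backed dma_fence that is published on the
object's reservation object, so anything that waits on obj->resv (flips,
execbuf, CPU access) also waits for the flush. A minimal sketch of that
pattern follows; the identifiers (async_flush, flush_object_async,
wait_for_flushes, flush_fence_ops, ...) are invented for illustration and
are not the driver's real names, while the kernel APIs (dma_fence_*,
reservation_object_*, drm_clflush_sg) are the ones the patch itself uses.

#include "i915_drv.h"

struct async_flush {
	struct dma_fence dma;		/* must stay first: freed via &dma */
	struct work_struct work;
	struct sg_table *pages;
};

static DEFINE_SPINLOCK(flush_lock);	/* spinlock backing the fences */
static u64 flush_context;		/* dma_fence_context_alloc(1), once at init */

static const char *flush_driver_name(struct dma_fence *fence)
{
	return "example";
}

static const char *flush_timeline_name(struct dma_fence *fence)
{
	return "clflush";
}

static bool flush_enable_signaling(struct dma_fence *fence)
{
	return true;			/* nothing extra needed; we always signal */
}

static const struct dma_fence_ops flush_fence_ops = {
	.get_driver_name	= flush_driver_name,
	.get_timeline_name	= flush_timeline_name,
	.enable_signaling	= flush_enable_signaling,
	.wait			= dma_fence_default_wait,
};

static void async_flush_work(struct work_struct *work)
{
	struct async_flush *f = container_of(work, typeof(*f), work);

	drm_clflush_sg(f->pages);	/* the slow part, now off struct_mutex */
	dma_fence_signal(&f->dma);
	dma_fence_put(&f->dma);		/* drop the reference from dma_fence_init() */
}

static void flush_object_async(struct drm_i915_gem_object *obj)
{
	struct async_flush *f = kmalloc(sizeof(*f), GFP_KERNEL);

	ww_mutex_lock(&obj->resv->lock, NULL);
	if (f && reservation_object_reserve_shared(obj->resv) == 0) {
		dma_fence_init(&f->dma, &flush_fence_ops, &flush_lock,
			       flush_context, 0);
		f->pages = obj->mm.pages;
		INIT_WORK(&f->work, async_flush_work);

		/* Publish the fence: anyone who waits on obj->resv
		 * (flips, execbuf, CPU access) now waits for the flush. */
		reservation_object_add_shared_fence(obj->resv, &f->dma);
		schedule_work(&f->work);
	} else {
		kfree(f);
		drm_clflush_sg(obj->mm.pages);	/* fall back to a sync flush */
	}
	ww_mutex_unlock(&obj->resv->lock);
}

/* Consumer side: any wait on the reservation object covers the flush. */
static int wait_for_flushes(struct drm_i915_gem_object *obj)
{
	long ret;

	ret = reservation_object_wait_timeout_rcu(obj->resv,
						  true, /* all shared fences */
						  true, /* interruptible */
						  MAX_SCHEDULE_TIMEOUT);
	return ret < 0 ? ret : 0;
}

The i915_gem_set_domain_ioctl hunk below relies on the same property:
after dropping struct_mutex it calls i915_gem_object_wait(), which
resolves the fences tracked in obj->resv and therefore blocks until any
pending clflush work has signalled.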
drivers/gpu/drm/i915/Makefile | 1 +
drivers/gpu/drm/i915/i915_drv.h | 8 +-
drivers/gpu/drm/i915/i915_gem.c | 60 +++----------
drivers/gpu/drm/i915/i915_gem_clflush.c | 138 +++++++++++++++++++++++++++++
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 6 +-
drivers/gpu/drm/i915/intel_display.c | 57 ++++++------
6 files changed, 190 insertions(+), 80 deletions(-)
create mode 100644 drivers/gpu/drm/i915/i915_gem_clflush.c
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 0857e5035f4d..6afd402e440b 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -29,6 +29,7 @@ i915-$(CONFIG_DEBUG_FS) += i915_debugfs.o
# GEM code
i915-y += i915_cmd_parser.o \
i915_gem_batch_pool.o \
+ i915_gem_clflush.o \
i915_gem_context.o \
i915_gem_dmabuf.o \
i915_gem_evict.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2754d5de76af..c80044267333 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3410,7 +3410,13 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
void i915_gem_reset(struct drm_i915_private *dev_priv);
void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
-bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
+
+void i915_gem_clflush_init(struct drm_i915_private *i915);
+int i915_gem_clflush_object(struct drm_i915_gem_object *obj,
+ unsigned int flags);
+#define I915_CLFLUSH_FORCE BIT(0)
+#define I915_CLFLUSH_SYNC BIT(1)
+
int __must_check i915_gem_init(struct drm_device *dev);
int __must_check i915_gem_init_hw(struct drm_device *dev);
void i915_gem_init_swizzling(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cffe60237b6a..524f72774537 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -230,7 +230,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj)
obj->mm.dirty = false;
if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0)
- i915_gem_clflush_object(obj, false);
+ i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
obj->base.write_domain = I915_GEM_DOMAIN_CPU;
@@ -1570,6 +1570,11 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
mutex_unlock(&dev->struct_mutex);
+ if (err == 0)
+ err = i915_gem_object_wait(obj,
+ I915_WAIT_INTERRUPTIBLE,
+ MAX_SCHEDULE_TIMEOUT,
+ NULL);
if (write_domain != 0)
intel_fb_obj_invalidate(obj, write_origin(obj, write_domain));
@@ -3236,44 +3241,6 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
return ret;
}
-bool
-i915_gem_clflush_object(struct drm_i915_gem_object *obj,
- bool force)
-{
- /* If we don't have a page list set up, then we're not pinned
- * to GPU, and we can ignore the cache flush because it'll happen
- * again at bind time.
- */
- if (!obj->mm.pages)
- return false;
-
- /*
- * Stolen memory is always coherent with the GPU as it is explicitly
- * marked as wc by the system, or the system is cache-coherent.
- */
- if (obj->stolen || obj->phys_handle)
- return false;
-
- /* If the GPU is snooping the contents of the CPU cache,
- * we do not need to manually clear the CPU cache lines. However,
- * the caches are only snooped when the render cache is
- * flushed/invalidated. As we always have to emit invalidations
- * and flushes when moving into and out of the RENDER domain, correct
- * snooping behaviour occurs naturally as the result of our domain
- * tracking.
- */
- if (!force && cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) {
- obj->cache_dirty = true;
- return false;
- }
-
- trace_i915_gem_object_clflush(obj);
- drm_clflush_sg(obj->mm.pages);
- obj->cache_dirty = false;
-
- return true;
-}
-
/** Flushes the GTT write domain for the object if it's dirty. */
static void
i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj)
@@ -3317,9 +3284,7 @@ i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj)
if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
return;
- if (i915_gem_clflush_object(obj, obj->pin_display))
- i915_gem_chipset_flush(to_i915(obj->base.dev));
-
+ i915_gem_clflush_object(obj, obj->pin_display ? I915_CLFLUSH_FORCE : 0);
intel_fb_obj_flush(obj, false, ORIGIN_CPU);
obj->base.write_domain = 0;
@@ -3526,10 +3491,8 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
* object is now coherent at its new cache level (with respect
* to the access domain).
*/
- if (obj->cache_dirty && cpu_write_needs_clflush(obj)) {
- if (i915_gem_clflush_object(obj, true))
- i915_gem_chipset_flush(to_i915(obj->base.dev));
- }
+ if (obj->cache_dirty && cpu_write_needs_clflush(obj))
+ i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
return 0;
}
@@ -3759,8 +3722,7 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
/* Flush the CPU cache if it's still invalid. */
if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) {
- i915_gem_clflush_object(obj, false);
-
+ i915_gem_clflush_object(obj, 0);
obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
}
@@ -4815,6 +4777,8 @@ int i915_gem_init(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
+ i915_gem_clflush_init(dev_priv);
+
if (!i915.enable_execlists) {
dev_priv->gt.resume = intel_legacy_submission_resume;
dev_priv->gt.cleanup_engine = intel_engine_cleanup;
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
new file mode 100644
index 000000000000..aae2155a51a8
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "i915_drv.h"
+
+static DEFINE_SPINLOCK(clflush_lock);
+static u64 clflush_context;
+
+struct clflush_fence {
+ struct dma_fence dma;
+ struct work_struct work;
+ struct drm_i915_gem_object *obj;
+};
+
+static const char *i915_clflush_get_driver_name(struct dma_fence *fence)
+{
+ return "i915";
+}
+
+static const char *i915_clflush_get_timeline_name(struct dma_fence *fence)
+{
+ return "clflush";
+}
+
+static bool i915_clflush_enable_signaling(struct dma_fence *fence)
+{
+ return true;
+}
+
+static const struct dma_fence_ops i915_clflush_ops = {
+ .get_driver_name = i915_clflush_get_driver_name,
+ .get_timeline_name = i915_clflush_get_timeline_name,
+ .enable_signaling = i915_clflush_enable_signaling,
+ .wait = dma_fence_default_wait,
+};
+
+static bool cpu_cache_is_coherent(struct drm_device *dev,
+ enum i915_cache_level level)
+{
+ return HAS_LLC(dev) || level != I915_CACHE_NONE;
+}
+
+static void i915_clflush_work(struct work_struct *work)
+{
+ struct clflush_fence *fence = container_of(work, typeof(*fence), work);
+
+ drm_clflush_sg(fence->obj->mm.pages);
+ i915_gem_object_put(fence->obj);
+
+ dma_fence_signal(&fence->dma);
+ dma_fence_put(&fence->dma);
+}
+
+int i915_gem_clflush_object(struct drm_i915_gem_object *obj,
+ unsigned int flags)
+{
+ struct clflush_fence *fence;
+
+ /* If we don't have a page list set up, then we're not pinned
+ * to GPU, and we can ignore the cache flush because it'll happen
+ * again at bind time.
+ */
+ if (!obj->mm.pages)
+ return 0;
+
+ /*
+ * Stolen memory is always coherent with the GPU as it is explicitly
+ * marked as wc by the system, or the system is cache-coherent.
+ */
+ if (obj->stolen || obj->phys_handle)
+ return 0;
+
+ /* If the GPU is snooping the contents of the CPU cache,
+ * we do not need to manually clear the CPU cache lines. However,
+ * the caches are only snooped when the render cache is
+ * flushed/invalidated. As we always have to emit invalidations
+ * and flushes when moving into and out of the RENDER domain, correct
+ * snooping behaviour occurs naturally as the result of our domain
+ * tracking.
+ */
+ if (!(flags & I915_CLFLUSH_FORCE) &&
+ cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) {
+ obj->cache_dirty = true;
+ return 0;
+ }
+
+ trace_i915_gem_object_clflush(obj);
+
+ fence = NULL;
+ ww_mutex_lock(&obj->resv->lock, NULL);
+ if (!(flags & I915_CLFLUSH_SYNC) &&
+ reservation_object_reserve_shared(obj->resv) == 0)
+ fence = kmalloc(sizeof(*fence), GFP_KERNEL);
+ if (fence) {
+ dma_fence_init(&fence->dma,
+ &i915_clflush_ops,
+ &clflush_lock,
+ clflush_context,
+ 0);
+
+ fence->obj = i915_gem_object_get(obj);
+ INIT_WORK(&fence->work, i915_clflush_work);
+ schedule_work(&fence->work);
+
+ reservation_object_add_shared_fence(obj->resv, &fence->dma);
+ } else {
+ drm_clflush_sg(obj->mm.pages);
+ }
+ ww_mutex_unlock(&obj->resv->lock);
+ obj->cache_dirty = false;
+ return 0;
+}
+
+void i915_gem_clflush_init(struct drm_i915_private *i915)
+{
+ clflush_context = dma_fence_context_alloc(1);
+}
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 55a8db2690c6..467725259f8c 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1115,13 +1115,13 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
continue;
+ if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
+ i915_gem_clflush_object(obj, 0);
+
ret = i915_gem_request_await_object
(req, obj, obj->base.pending_write_domain);
if (ret)
return ret;
-
- if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
- i915_gem_clflush_object(obj, false);
}
/* Unconditionally flush any chipset caches (for streaming writes). */
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 6e7f8a47b862..8ee4e84dc1b8 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -14155,12 +14155,39 @@ intel_prepare_plane_fb(struct drm_plane *plane,
{
struct intel_atomic_state *intel_state =
to_intel_atomic_state(new_state->state);
- struct drm_device *dev = plane->dev;
- struct drm_i915_private *dev_priv = to_i915(dev);
+ struct drm_i915_private *dev_priv = to_i915(plane->dev);
struct drm_framebuffer *fb = new_state->fb;
struct drm_i915_gem_object *obj = intel_fb_obj(fb);
int ret;
+ if (obj) {
+ if (plane->type == DRM_PLANE_TYPE_CURSOR &&
+ INTEL_INFO(dev_priv)->cursor_needs_physical) {
+ int align = IS_I830(dev_priv) ? 16 * 1024 : 256;
+
+ ret = i915_gem_object_attach_phys(obj, align);
+ if (ret) {
+ DRM_DEBUG_KMS("failed to attach phys object\n");
+ return ret;
+ }
+ } else {
+ struct i915_vma *vma;
+
+ vma = intel_pin_and_fence_fb_obj(fb, new_state->rotation);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ to_intel_plane_state(new_state)->vma = vma;
+ if (to_intel_plane_state(plane->state)->vma != vma) {
+ struct intel_crtc_state *crtc_state;
+
+ crtc_state = intel_atomic_get_crtc_state(new_state->state,
+ to_intel_crtc(new_state->crtc));
+ crtc_state->fb_changed = true;
+ }
+ }
+ }
+
if (plane->state->fb && old_plane_needs_modeset(plane, new_state)) {
struct drm_i915_gem_object *old_obj =
intel_fb_obj(plane->state->fb);
@@ -14207,32 +14234,6 @@ intel_prepare_plane_fb(struct drm_plane *plane,
i915_gem_object_wait_priority(obj, 0, I915_PRIORITY_DISPLAY);
}
- if (plane->type == DRM_PLANE_TYPE_CURSOR &&
- INTEL_INFO(dev)->cursor_needs_physical) {
- int align = IS_I830(dev_priv) ? 16 * 1024 : 256;
-
- ret = i915_gem_object_attach_phys(obj, align);
- if (ret) {
- DRM_DEBUG_KMS("failed to attach phys object\n");
- return ret;
- }
- } else {
- struct i915_vma *vma;
-
- vma = intel_pin_and_fence_fb_obj(fb, new_state->rotation);
- if (IS_ERR(vma))
- return PTR_ERR(vma);
-
- to_intel_plane_state(new_state)->vma = vma;
- if (to_intel_plane_state(plane->state)->vma != vma) {
- struct intel_crtc_state *crtc_state;
-
- crtc_state = intel_atomic_get_crtc_state(new_state->state,
- to_intel_crtc(new_state->crtc));
- crtc_state->fb_changed = true;
- }
- }
-
return 0;
}
--
2.10.2