[Intel-gfx] [PATCH 3/6] drm/i915: Opportunistically reduce flushing at execbuf

Ben Widawsky benjamin.widawsky at intel.com
Mon Feb 9 13:54:16 PST 2015


If we're moving a bunch of buffers from the CPU domain to the GPU domain and
we've already blown out the entire CPU cache via a wbinvd, there is nothing
left to flush for the individual objects, so the per-object clflushes can be
skipped.
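
To make the heuristic concrete, here is a minimal standalone sketch in C of
the decision this patch makes at execbuf time. The struct and helper below
are hypothetical stand-ins for illustration, not the driver's actual types;
the threshold is seeded from the CPU cache size (falling back to 8 MiB in the
real patch):

#include <stdbool.h>
#include <stddef.h>

/*
 * Sketch only, not driver code: if any single CPU-dirty, non-coherent
 * buffer in the execbuf is at least as large as the CPU cache, clflushing
 * it would walk the whole cache anyway, so a single wbinvd up front is
 * cheaper than per-object clflushes.
 */
struct sketch_buf {
	size_t size;		/* size of the buffer's VMA node */
	bool cpu_dirty;		/* write_domain includes the CPU domain */
	bool coherent;		/* LLC platform or snooped cache level */
};

static bool want_wbinvd(const struct sketch_buf *bufs, int n,
			size_t cache_size)
{
	int i;

	for (i = 0; i < n; i++)
		if (bufs[i].size >= cache_size &&
		    bufs[i].cpu_dirty && !bufs[i].coherent)
			return true;	/* flush everything once */
	return false;
}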

With this and the previous patches, I am seeing a 3x FPS increase on a certain
benchmark which uses a giant 2D array texture. Unless I missed something in the
code, it should only affect non-LLC i915 platforms.

I haven't yet run any numbers for other benchmarks, nor have I attempted to
check if various conformance tests still pass.

v2: Rewrite the patch to be i915-only.
Determine up front whether we will wbinvd.

Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h            |  8 ++++++++
 drivers/gpu/drm/i915/i915_gem.c            | 11 +++++------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 20 ++++++++++++++++----
 drivers/gpu/drm/i915/intel_lrc.c           | 10 ++++++++--
 4 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 90ff6aa..5d2f62d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1643,6 +1643,7 @@ struct i915_workarounds {
 
 struct eb_vmas {
 	struct list_head vmas;
+	bool do_wbinvd;
 	int and;
 	union {
 		struct i915_vma *lut[0];
@@ -1913,6 +1914,8 @@ struct drm_i915_private {
 		void (*stop_ring)(struct intel_engine_cs *ring);
 	} gt;
 
+	size_t wbinvd_threshold;
+
 	uint32_t request_uniq;
 
 	/*
@@ -2810,6 +2813,11 @@ static inline bool i915_stop_ring_allow_warn(struct drm_i915_private *dev_priv)
 
 void i915_gem_reset(struct drm_device *dev);
 bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
+static inline bool cpu_cache_is_coherent(struct drm_device *dev,
+					 enum i915_cache_level level)
+{
+	return HAS_LLC(dev) || level != I915_CACHE_NONE;
+}
 int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj);
 int __must_check i915_gem_init(struct drm_device *dev);
 int i915_gem_init_rings(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fc81889..5bfb332 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -61,12 +61,6 @@ static int i915_gem_shrinker_oom(struct notifier_block *nb,
 				 void *ptr);
 static unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv);
 
-static bool cpu_cache_is_coherent(struct drm_device *dev,
-				  enum i915_cache_level level)
-{
-	return HAS_LLC(dev) || level != I915_CACHE_NONE;
-}
-
 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 {
 	if (!cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
@@ -4878,6 +4872,11 @@ int i915_gem_init(struct drm_device *dev)
 		dev_priv->gt.stop_ring = intel_logical_ring_stop;
 	}
 
+	dev_priv->wbinvd_threshold = boot_cpu_data.x86_cache_size << 10;
+	/* Pick a high default in the unlikely case we got nothing */
+	if (!dev_priv->wbinvd_threshold)
+		dev_priv->wbinvd_threshold = (8 << 20);
+
 	ret = i915_gem_init_userptr(dev);
 	if (ret)
 		goto out_unlock;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 13ed13e..56f9268 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -50,7 +50,7 @@ eb_create(struct drm_i915_gem_execbuffer2 *args)
 		unsigned size = args->buffer_count;
 		size *= sizeof(struct i915_vma *);
 		size += sizeof(struct eb_vmas);
-		eb = kmalloc(size, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
+		eb = kzalloc(size, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
 	}
 
 	if (eb == NULL) {
@@ -78,6 +78,7 @@ eb_reset(struct eb_vmas *eb)
 {
 	if (eb->and >= 0)
 		memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
+	eb->do_wbinvd = false;
 }
 
 static int
@@ -154,6 +155,11 @@ eb_lookup_vmas(struct eb_vmas *eb,
 			hlist_add_head(&vma->exec_node,
 				       &eb->buckets[handle & eb->and]);
 		}
+
+		if (vma->node.size >= to_i915(obj->base.dev)->wbinvd_threshold &&
+		    obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
+		    !cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
+			eb->do_wbinvd = true;
 		++i;
 	}
 
@@ -826,7 +832,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
 	struct list_head *vmas = &eb->vmas;
 	struct i915_vma *vma;
 	uint32_t flush_domains = 0;
-	bool flush_chipset = false;
+	bool flush_chipset = eb->do_wbinvd;
 	int ret;
 
 	list_for_each_entry(vma, vmas, exec_list) {
@@ -835,12 +841,18 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
 		if (ret)
 			return ret;
 
+		flush_domains |= obj->base.write_domain;
+
+		if (eb->do_wbinvd)
+			continue;
+
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
 			flush_chipset |= i915_gem_clflush_object(obj, false);
-
-		flush_domains |= obj->base.write_domain;
 	}
 
+	if (eb->do_wbinvd)
+		wbinvd();
+
 	if (flush_chipset)
 		i915_gem_chipset_flush(ring->dev);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 03741f9..16ca4a2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -586,12 +586,18 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
 		if (ret)
 			return ret;
 
+		flush_domains |= obj->base.write_domain;
+
+		if (eb->do_wbinvd)
+			continue;
+
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
 			i915_gem_clflush_object(obj, false);
-
-		flush_domains |= obj->base.write_domain;
 	}
 
+	if (eb->do_wbinvd)
+		wbinvd();
+
 	if (flush_domains & I915_GEM_DOMAIN_GTT)
 		wmb();
 
-- 
2.3.0