[Intel-gfx] [RFC][PATCH] drm/i915: Try to avoid most cache flushes on LLC platforms (WIP)

ville.syrjala at linux.intel.com
Mon Aug 5 22:13:24 CEST 2013


From: Ville Syrjälä <ville.syrjala at linux.intel.com>

A bit more food for the cache discussions. My idea here is to track
whether an object is part of any fb; if it is, we assume it can be
used for scanout and thus may need some extra clflushes. Otherwise
we try to avoid clflushes on LLC platforms.
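
The net effect on the CPU write paths boils down to something like the
sketch below. The name cpu_write_needs_clflush() is made up here just
for illustration; it combines the cpu_caches_coherent() and
scanout_needs_clflush() helpers the patch actually adds:

static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
{
	/* No LLC and not snooped: the GPU won't see cached CPU writes. */
	if (!HAS_LLC(obj->base.dev) && obj->cache_level == I915_CACHE_NONE)
		return true;

	/* The display engine doesn't snoop the caches, so an object
	 * that is part of an fb and not marked LLC cached still needs
	 * a flush before it can be scanned out. */
	return obj->cache_level != I915_CACHE_LLC &&
	       obj->cache_level != I915_CACHE_LLC_MLC &&
	       atomic_read(&obj->fb_count) > 0;
}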

I also included some GFDT stuff in there just to think a bit about how
it would work, but I left out the GFDT flushes themselves.
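
With the flag approach, tagging e.g. an LLC cached scanout buffer
would look roughly like this (illustrative only; HAS_GFDT() is
hardcoded to 0 below, so no platform actually takes this path yet):

	/* OR the GFDT flag into the cache mode; gen6_pte_encode()
	 * then picks it out and sets the GFDT bit in the PTE. */
	pte = gen6_pte_encode(addr, I915_CACHE_LLC | I915_CACHE_GFDT);
	/* -> GEN6_PTE_VALID | GEN6_PTE_CACHE_LLC | GEN6_PTE_GFDT | addr */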

Also I didn't even boot-test this, so it may explode quite
spectacularly.
---
 drivers/gpu/drm/i915/i915_drv.h            |  5 +++
 drivers/gpu/drm/i915/i915_gem.c            | 63 +++++++++++++++++++++++-------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 +++++-
 drivers/gpu/drm/i915/i915_gem_gtt.c        |  6 ++-
 drivers/gpu/drm/i915/intel_display.c       |  2 +
 drivers/gpu/drm/i915/intel_fb.c            |  1 +
 6 files changed, 72 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8b14e22..ec275ed 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -451,6 +451,8 @@ enum i915_cache_level {
 	I915_CACHE_NONE = 0,
 	I915_CACHE_LLC,
 	I915_CACHE_LLC_MLC, /* gen6+, in docs at least! */
+	I915_CACHE_WT, /* hsw gt3e */
+	I915_CACHE_GFDT = 0x4, /* SNB/IVB: flag ORed w/ the cache mode */
 };
 
 typedef uint32_t gen6_gtt_pte_t;
@@ -1388,6 +1390,8 @@ struct drm_i915_gem_object {
 
 	/** for phy allocated objects */
 	struct drm_i915_gem_phys_object *phys_obj;
+
+	atomic_t fb_count;
 };
 #define to_gem_object(obj) (&((struct drm_i915_gem_object *)(obj))->base)
 
@@ -1494,6 +1498,7 @@ struct drm_i915_file_private {
 #define HAS_VEBOX(dev)          (INTEL_INFO(dev)->has_vebox_ring)
 #define HAS_LLC(dev)            (INTEL_INFO(dev)->has_llc)
 #define I915_NEED_GFX_HWS(dev)	(INTEL_INFO(dev)->need_gfx_hws)
+#define HAS_GFDT(dev)		(0) /* WIP: not enabled on any platform yet */
 
 #define HAS_HW_CONTEXTS(dev)	(INTEL_INFO(dev)->gen >= 6)
 #define HAS_ALIASING_PPGTT(dev)	(INTEL_INFO(dev)->gen >=6 && !IS_VALLEYVIEW(dev))
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index bd6eb64..38d3241 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -394,6 +394,20 @@ shmem_pread_slow(struct page *page, int shmem_page_offset, int page_length,
 	return ret ? - EFAULT : 0;
 }
 
+static bool cpu_caches_coherent(struct drm_i915_gem_object *obj)
+{
+	return HAS_LLC(obj->base.dev) || obj->cache_level != I915_CACHE_NONE;
+}
+
+static bool
+scanout_needs_clflush(struct drm_i915_gem_object *obj,
+		      enum i915_cache_level cache_level)
+{
+	return cache_level != I915_CACHE_LLC &&
+		cache_level != I915_CACHE_LLC_MLC &&
+		atomic_read(&obj->fb_count) > 0;
+}
+
 static int
 i915_gem_shmem_pread(struct drm_device *dev,
 		     struct drm_i915_gem_object *obj,
@@ -419,7 +433,7 @@ i915_gem_shmem_pread(struct drm_device *dev,
 		 * read domain and manually flush cachelines (if required). This
 		 * optimizes for the case when the gpu will dirty the data
 		 * anyway again before the next pread happens. */
-		if (obj->cache_level == I915_CACHE_NONE)
+		if (!cpu_caches_coherent(obj))
 			needs_clflush = 1;
 		if (i915_gem_obj_ggtt_bound(obj)) {
 			ret = i915_gem_object_set_to_gtt_domain(obj, false);
@@ -736,7 +750,8 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 		 * write domain and manually flush cachelines (if required). This
 		 * optimizes for the case when the gpu will use the data
 		 * right away and we therefore have to clflush anyway. */
-		if (obj->cache_level == I915_CACHE_NONE)
+		if (!cpu_caches_coherent(obj) ||
+		    scanout_needs_clflush(obj, obj->cache_level))
 			needs_clflush_after = 1;
 		if (i915_gem_obj_ggtt_bound(obj)) {
 			ret = i915_gem_object_set_to_gtt_domain(obj, true);
@@ -746,8 +761,8 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 	}
 	/* Same trick applies for invalidate partially written cachelines before
 	 * writing.  */
-	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)
-	    && obj->cache_level == I915_CACHE_NONE)
+	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU) &&
+	    !cpu_caches_coherent(obj))
 		needs_clflush_before = 1;
 
 	ret = i915_gem_object_get_pages(obj);
@@ -826,7 +841,9 @@ out:
 		 * out of the cpu write domain while we've dropped the lock.
 		 */
 		if (!needs_clflush_after &&
-		    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
+		    obj->base.write_domain != I915_GEM_DOMAIN_CPU &&
+		    (!cpu_caches_coherent(obj) ||
+		     scanout_needs_clflush(obj, obj->cache_level))) {
 			i915_gem_clflush_object(obj);
 			i915_gem_chipset_flush(dev);
 		}
@@ -1256,7 +1273,7 @@ i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
 	}
 
 	/* Pinned buffers may be scanout, so flush the cache */
-	if (obj->pin_count)
+	if (scanout_needs_clflush(obj, obj->cache_level))
 		i915_gem_object_flush_cpu_write_domain(obj);
 
 	drm_gem_object_unreference(&obj->base);
@@ -3211,9 +3228,6 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj)
 	 * snooping behaviour occurs naturally as the result of our domain
 	 * tracking.
 	 */
-	if (obj->cache_level != I915_CACHE_NONE)
-		return;
-
 	trace_i915_gem_object_clflush(obj);
 
 	drm_clflush_sg(obj->pages);
@@ -3255,8 +3269,11 @@ i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj)
 	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
 		return;
 
-	i915_gem_clflush_object(obj);
-	i915_gem_chipset_flush(obj->base.dev);
+	if (!cpu_caches_coherent(obj) ||
+	    scanout_needs_clflush(obj, obj->cache_level)) {
+		i915_gem_clflush_object(obj);
+		i915_gem_chipset_flush(obj->base.dev);
+	}
 	old_write_domain = obj->base.write_domain;
 	obj->base.write_domain = 0;
 
@@ -3289,7 +3306,9 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
 	if (ret)
 		return ret;
 
-	i915_gem_object_flush_cpu_write_domain(obj);
+	if (!cpu_caches_coherent(obj) ||
+	    scanout_needs_clflush(obj, obj->cache_level))
+		i915_gem_object_flush_cpu_write_domain(obj);
 
 	/* Serialise direct access to this object with the barriers for
 	 * coherent writes from the GPU, by effectively invalidating the
@@ -3374,7 +3393,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 					       obj, cache_level);
 	}
 
-	if (cache_level == I915_CACHE_NONE) {
+	if (scanout_needs_clflush(obj, cache_level)) {
 		u32 old_read_domains, old_write_domain;
 
 		/* If we're coming from LLC cached, then we haven't
@@ -3476,6 +3495,7 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 				     u32 alignment,
 				     struct intel_ring_buffer *pipelined)
 {
+	enum i915_cache_level cache_level;
 	u32 old_read_domains, old_write_domain;
 	int ret;
 
@@ -3494,7 +3514,19 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 	 * of uncaching, which would allow us to flush all the LLC-cached data
 	 * with that bit in the PTE to main memory with just one PIPE_CONTROL.
 	 */
-	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
+	if (0) { /* HSW GT3e */
+		cache_level = obj->cache_level;
+		if (cache_level != I915_CACHE_NONE)
+			cache_level = I915_CACHE_WT;
+	} else if (HAS_GFDT(obj->base.dev)) {
+		cache_level = obj->cache_level;
+		if (cache_level != I915_CACHE_NONE)
+			cache_level |= I915_CACHE_GFDT;
+	} else {
+		cache_level = I915_CACHE_NONE;
+	}
+
+	ret = i915_gem_object_set_cache_level(obj, cache_level);
 	if (ret)
 		return ret;
 
@@ -3567,7 +3599,8 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 
 	/* Flush the CPU cache if it's still invalid. */
 	if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) {
-		i915_gem_clflush_object(obj);
+		if (!cpu_caches_coherent(obj))
+			i915_gem_clflush_object(obj);
 
 		obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 9939d2e..ceead5e 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -702,12 +702,18 @@ err:
 	return ret;
 }
 
+static bool cpu_caches_coherent(struct drm_i915_gem_object *obj)
+{
+	return HAS_LLC(obj->base.dev) || obj->cache_level != I915_CACHE_NONE;
+}
+
 static int
 i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
 				struct list_head *objects)
 {
 	struct drm_i915_gem_object *obj;
 	uint32_t flush_domains = 0;
+	bool need_chipset_flush = false;
 	int ret;
 
 	list_for_each_entry(obj, objects, exec_list) {
@@ -715,13 +721,16 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
 		if (ret)
 			return ret;
 
-		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
+		if ((obj->base.write_domain & I915_GEM_DOMAIN_CPU) &&
+		    !cpu_caches_coherent(obj)) {
 			i915_gem_clflush_object(obj);
+			need_chipset_flush = true;
+		}
 
 		flush_domains |= obj->base.write_domain;
 	}
 
-	if (flush_domains & I915_GEM_DOMAIN_CPU)
+	if (need_chipset_flush)
 		i915_gem_chipset_flush(ring->dev);
 
 	if (flush_domains & I915_GEM_DOMAIN_GTT)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index f38cc69..ea1ef86 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -44,6 +44,7 @@
 #define HSW_PTE_UNCACHED		(0)
 #define GEN6_PTE_CACHE_LLC		(2 << 1)
 #define GEN6_PTE_CACHE_LLC_MLC		(3 << 1)
+#define GEN6_PTE_GFDT			(1 << 3)
 #define GEN6_PTE_ADDR_ENCODE(addr)	GEN6_GTT_ADDR_ENCODE(addr)
 #define HSW_PTE_ADDR_ENCODE(addr)	HSW_GTT_ADDR_ENCODE(addr)
 
@@ -62,7 +63,10 @@ static gen6_gtt_pte_t gen6_pte_encode(dma_addr_t addr,
 	gen6_gtt_pte_t pte = GEN6_PTE_VALID;
 	pte |= GEN6_PTE_ADDR_ENCODE(addr);
 
-	switch (level) {
+	if (level & I915_CACHE_GFDT)
+		pte |= GEN6_PTE_GFDT;
+
+	switch (level & ~I915_CACHE_GFDT) {
 	case I915_CACHE_LLC_MLC:
 		pte |= GEN6_PTE_CACHE_LLC_MLC;
 		break;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 8afdcfe..c4c7c52 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -9409,6 +9409,7 @@ static void intel_user_framebuffer_destroy(struct drm_framebuffer *fb)
 	struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb);
 
 	drm_framebuffer_cleanup(fb);
+	atomic_dec(&intel_fb->obj->fb_count);
 	drm_gem_object_unreference_unlocked(&intel_fb->obj->base);
 
 	kfree(intel_fb);
@@ -9527,6 +9528,7 @@ int intel_framebuffer_init(struct drm_device *dev,
 
 	drm_helper_mode_fill_fb_struct(&intel_fb->base, mode_cmd);
 	intel_fb->obj = obj;
+	atomic_inc(&obj->fb_count);
 
 	ret = drm_framebuffer_init(dev, &intel_fb->base, &intel_fb_funcs);
 	if (ret) {
diff --git a/drivers/gpu/drm/i915/intel_fb.c b/drivers/gpu/drm/i915/intel_fb.c
index f3c97e0..10b7c36 100644
--- a/drivers/gpu/drm/i915/intel_fb.c
+++ b/drivers/gpu/drm/i915/intel_fb.c
@@ -210,6 +210,7 @@ static void intel_fbdev_destroy(struct drm_device *dev,
 	drm_framebuffer_unregister_private(&ifb->base);
 	drm_framebuffer_cleanup(&ifb->base);
 	if (ifb->obj) {
+		atomic_dec(&ifb->obj->fb_count);
 		drm_gem_object_unreference_unlocked(&ifb->obj->base);
 		ifb->obj = NULL;
 	}
-- 
1.8.1.5