[Intel-gfx] [PATCH] drm/i915: GFDT support for SNB/IVB

Chris Wilson chris at chris-wilson.co.uk
Wed Mar 6 17:28:09 CET 2013


From: Ville Syrjälä <ville.syrjala at linux.intel.com>

Currently all scanout buffers must be uncached because the
display controller doesn't snoop the LLC. SNB introduced another
method to guarantee coherency for the display controller. It's
called the GFDT or graphics data type.

Pages that have the GFDT bit enabled in their PTEs get flushed
all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
issued with the "synchronize GFDT" bit set.

So rather than making all scanout buffers uncached, set the GFDT
bit in their PTEs, and modify the ring flush functions to enable
the "synchronize GFDT" bit.

On HSW the GFDT bit was removed from the PTE, and it's only present in
surface state, so we can't really set it from the kernel. Also the docs
state that the hardware isn't actually guaranteed to respect the GFDT
bit. So it looks like GFDT isn't all that useful on HSW.

So far I've tried this very quickly on an IVB machine, and
it seems to be working as advertised. No idea if it does any
good though.

On an i5-2520m (laptop) running gnome-shell at 1366x768:
  padman 		140.78 -> 145.98 fps
  openarena 		183.72 -> 186.87 fps
  gtkperf ComboBoxEntry	20.27 -> 22.14s
  gtkperf pixbuf	 1.12 ->  1.47s
  x11perf -aa10text	13.40 -> 13.20 Mglyphs
which are well within the throttling noise.

v2 [ickle]: adapt to comply with existing userspace guarantees

Signed-off-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c            |    4 +
 drivers/gpu/drm/i915/i915_drv.h            |   16 +++-
 drivers/gpu/drm/i915/i915_gem.c            |  111 ++++++++++++++++++++++++----
 drivers/gpu/drm/i915/i915_gem_context.c    |    5 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   12 ++-
 drivers/gpu/drm/i915/i915_gem_gtt.c        |   38 ++++++----
 drivers/gpu/drm/i915/i915_reg.h            |    2 +
 drivers/gpu/drm/i915/i915_trace.h          |   10 ++-
 drivers/gpu/drm/i915/intel_ringbuffer.c    |   79 +++++++++++++++++---
 drivers/gpu/drm/i915/intel_ringbuffer.h    |    5 +-
 10 files changed, 225 insertions(+), 57 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 66d06ac..ff935f1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -235,6 +235,7 @@ static const struct intel_device_info intel_sandybridge_d_info = {
 	.has_bsd_ring = 1,
 	.has_blt_ring = 1,
 	.has_llc = 1,
+	.has_gfdt = 1,
 	.has_force_wake = 1,
 };
 
@@ -245,6 +246,7 @@ static const struct intel_device_info intel_sandybridge_m_info = {
 	.has_bsd_ring = 1,
 	.has_blt_ring = 1,
 	.has_llc = 1,
+	.has_gfdt = 1,
 	.has_force_wake = 1,
 };
 
@@ -254,6 +256,7 @@ static const struct intel_device_info intel_ivybridge_d_info = {
 	.has_bsd_ring = 1,
 	.has_blt_ring = 1,
 	.has_llc = 1,
+	.has_gfdt = 1,
 	.has_force_wake = 1,
 };
 
@@ -264,6 +267,7 @@ static const struct intel_device_info intel_ivybridge_m_info = {
 	.has_bsd_ring = 1,
 	.has_blt_ring = 1,
 	.has_llc = 1,
+	.has_gfdt = 1,
 	.has_force_wake = 1,
 };
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 535bf29..9841dd7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -367,6 +367,7 @@ struct intel_device_info {
 	u8 has_bsd_ring:1;
 	u8 has_blt_ring:1;
 	u8 has_llc:1;
+	u8 has_gfdt:1;
 };
 
 enum i915_cache_level {
@@ -409,7 +410,8 @@ struct i915_gtt {
 	void (*gtt_insert_entries)(struct drm_device *dev,
 				   struct sg_table *st,
 				   unsigned int pg_start,
-				   enum i915_cache_level cache_level);
+				   enum i915_cache_level cache_level,
+				   bool gfdt);
 };
 #define gtt_total_entries(gtt) ((gtt).total >> PAGE_SHIFT)
 
@@ -430,7 +432,8 @@ struct i915_hw_ppgtt {
 	void (*insert_entries)(struct i915_hw_ppgtt *ppgtt,
 			       struct sg_table *st,
 			       unsigned int pg_start,
-			       enum i915_cache_level cache_level);
+			       enum i915_cache_level cache_level,
+			       bool gfdt);
 	void (*cleanup)(struct i915_hw_ppgtt *ppgtt);
 };
 
@@ -1170,6 +1173,8 @@ struct drm_i915_gem_object {
 	unsigned int fenced_gpu_access:1;
 
 	unsigned int cache_level:2;
+	unsigned int gfdt:1;
+	unsigned int gfdt_dirty:1;
 
 	unsigned int has_aliasing_ppgtt_mapping:1;
 	unsigned int has_global_gtt_mapping:1;
@@ -1328,6 +1333,9 @@ struct drm_i915_file_private {
 #define HAS_LLC(dev)            (INTEL_INFO(dev)->has_llc)
 #define I915_NEED_GFX_HWS(dev)	(INTEL_INFO(dev)->need_gfx_hws)
 
+/* Only SNB and IVB have GFDT in PTEs */
+#define HAS_GFDT(dev)            (INTEL_INFO(dev)->has_gfdt)
+
 #define HAS_HW_CONTEXTS(dev)	(INTEL_INFO(dev)->gen >= 6)
 #define HAS_ALIASING_PPGTT(dev)	(INTEL_INFO(dev)->gen >=6 && !IS_VALLEYVIEW(dev))
 
@@ -1702,14 +1710,14 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
 void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev);
 void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
 			    struct drm_i915_gem_object *obj,
-			    enum i915_cache_level cache_level);
+			    enum i915_cache_level cache_level, bool gfdt);
 void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
 			      struct drm_i915_gem_object *obj);
 
 void i915_gem_restore_gtt_mappings(struct drm_device *dev);
 int __must_check i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj);
 void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
-				enum i915_cache_level cache_level);
+			      enum i915_cache_level cache_level, bool gfdt);
 void i915_gem_gtt_unbind_object(struct drm_i915_gem_object *obj);
 void i915_gem_gtt_finish_object(struct drm_i915_gem_object *obj);
 void i915_gem_init_global_gtt(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 19fc21b..2b4bc88 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2282,6 +2282,25 @@ i915_gem_retire_work_handler(struct work_struct *work)
 	mutex_unlock(&dev->struct_mutex);
 }
 
+static int i915_gem_object_flush_gfdt(struct drm_i915_gem_object *obj,
+				      struct intel_ring_buffer *ring)
+{
+	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
+	int ret;
+
+	if (!obj->gfdt_dirty)
+		return 0;
+
+	if (ring == NULL)
+		ring = &dev_priv->ring[BCS];
+	ret = intel_ring_flush_internal(ring, I915_FLUSH_GFDT);
+	if (ret)
+		return ret;
+
+	obj->gfdt_dirty = false;
+	return 0;
+}
+
 /**
  * Ensures that an object will eventually get non-busy by flushing any required
  * write domains, emitting any outstanding lazy request and retiring and
@@ -2292,6 +2311,10 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
 {
 	int ret;
 
+	ret = i915_gem_object_flush_gfdt(obj, obj->ring);
+	if (ret)
+		return ret;
+
 	if (obj->active) {
 		ret = i915_gem_check_olr(obj->ring, obj->last_read_seqno);
 		if (ret)
@@ -3034,6 +3057,9 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj)
 	if (obj->stolen)
 		return;
 
+	if (obj->gfdt)
+		obj->gfdt_dirty = 1;
+
 	/* If the GPU is snooping the contents of the CPU cache,
 	 * we do not need to manually clear the CPU cache lines.  However,
 	 * the caches are only snooped when the render cache is
@@ -3161,7 +3187,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	int ret;
 
-	if (obj->cache_level == cache_level)
+	if (obj->cache_level == cache_level && !obj->gfdt)
 		return 0;
 
 	if (obj->pin_count) {
@@ -3193,10 +3219,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 		}
 
 		if (obj->has_global_gtt_mapping)
-			i915_gem_gtt_bind_object(obj, cache_level);
+			i915_gem_gtt_bind_object(obj, cache_level, false);
 		if (obj->has_aliasing_ppgtt_mapping)
 			i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
-					       obj, cache_level);
+					       obj, cache_level, false);
 
 		obj->gtt_space->color = cache_level;
 	}
@@ -3225,6 +3251,48 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 	}
 
 	obj->cache_level = cache_level;
+	obj->gfdt_dirty = obj->gfdt = false;
+	i915_gem_verify_gtt(dev);
+	return 0;
+}
+
+static int i915_gem_object_set_gfdt(struct drm_i915_gem_object *obj,
+				    bool gfdt)
+{
+	struct drm_device *dev = obj->base.dev;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	int ret;
+
+	if (!HAS_GFDT(dev))
+		return -ENODEV;
+
+	if (obj->gfdt == gfdt)
+		return 0;
+
+	/* no point in setting GFDT on uncached object */
+	if (obj->cache_level == I915_CACHE_NONE)
+		return -EINVAL;
+
+	if (obj->gtt_space) {
+		ret = i915_gem_object_finish_gpu(obj);
+		if (ret)
+			return ret;
+
+		i915_gem_object_finish_gtt(obj);
+
+		if (obj->has_global_gtt_mapping)
+			i915_gem_gtt_bind_object(obj, obj->cache_level, gfdt);
+		if (obj->has_aliasing_ppgtt_mapping)
+			i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
+					       obj, obj->cache_level, gfdt);
+	}
+
+	/* Explicitly perform the clflush to clear untagged dirty data */
+	if (obj->pages && !obj->stolen)
+		drm_clflush_sg(obj->pages);
+
+	obj->gfdt = gfdt;
+	obj->gfdt_dirty = false;
 	i915_gem_verify_gtt(dev);
 	return 0;
 }
@@ -3310,18 +3378,23 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 			return ret;
 	}
 
-	/* The display engine is not coherent with the LLC cache on gen6.  As
-	 * a result, we make sure that the pinning that is about to occur is
-	 * done with uncached PTEs. This is lowest common denominator for all
-	 * chipsets.
-	 *
-	 * However for gen6+, we could do better by using the GFDT bit instead
-	 * of uncaching, which would allow us to flush all the LLC-cached data
-	 * with that bit in the PTE to main memory with just one PIPE_CONTROL.
+	/*
+	 * Try to set the GFDT bit instead of uncaching. This allows us to flush
+	 * all the LLC-cached data with that bit in the PTE to main memory with
+	 * just one PIPE_CONTROL.
 	 */
-	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
-	if (ret)
-		return ret;
+	ret = i915_gem_object_set_gfdt(obj, true);
+	if (ret) {
+		/*
+		 * The display engine is not coherent with the LLC cache on gen6.  As
+		 * a result, we make sure that the pinning that is about to occur is
+		 * done with uncached PTEs. This is lowest common denominator for all
+		 * chipsets.
+		 */
+		ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
+		if (ret)
+			return ret;
+	}
 
 	/* As the user may map the buffer once pinned in the display plane
 	 * (e.g. libkms for the bootup splash), we have to ensure that we
@@ -3346,6 +3419,12 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 					    old_read_domains,
 					    old_write_domain);
 
+	ret = i915_gem_object_flush_gfdt(obj, pipelined);
+	if (ret) {
+		i915_gem_object_unpin(obj);
+		return ret;
+	}
+
 	return 0;
 }
 
@@ -3505,11 +3584,11 @@ i915_gem_object_pin(struct drm_i915_gem_object *obj,
 			return ret;
 
 		if (!dev_priv->mm.aliasing_ppgtt)
-			i915_gem_gtt_bind_object(obj, obj->cache_level);
+			i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 	}
 
 	if (!obj->has_global_gtt_mapping && map_and_fenceable)
-		i915_gem_gtt_bind_object(obj, obj->cache_level);
+		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 
 	obj->pin_count++;
 	obj->pin_mappable |= map_and_fenceable;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 94d873a..b8301d3 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -309,7 +309,7 @@ mi_set_context(struct intel_ring_buffer *ring,
 	 * itlb_before_ctx_switch.
 	 */
 	if (IS_GEN6(ring->dev) && ring->itlb_before_ctx_switch) {
-		ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, 0);
+		ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, 0, 0);
 		if (ret)
 			return ret;
 	}
@@ -371,7 +371,8 @@ static int do_switch(struct i915_hw_context *to)
 	}
 
 	if (!to->obj->has_global_gtt_mapping)
-		i915_gem_gtt_bind_object(to->obj, to->obj->cache_level);
+		i915_gem_gtt_bind_object(to->obj, to->obj->cache_level,
+					 to->obj->gfdt);
 
 	if (!to->is_initialized || is_default_context(to))
 		hw_flags |= MI_RESTORE_INHIBIT;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 6ae62ea..4ad0323 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -197,7 +197,8 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	    reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
 	    !target_i915_obj->has_global_gtt_mapping)) {
 		i915_gem_gtt_bind_object(target_i915_obj,
-					 target_i915_obj->cache_level);
+					 target_i915_obj->cache_level,
+					 target_i915_obj->gfdt);
 	}
 
 	/* Validate that the target is in a valid r/w GPU domain */
@@ -433,7 +434,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 	/* Ensure ppgtt mapping exists if needed */
 	if (dev_priv->mm.aliasing_ppgtt && !obj->has_aliasing_ppgtt_mapping) {
 		i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
-				       obj, obj->cache_level);
+				       obj, obj->cache_level, obj->gfdt);
 
 		obj->has_aliasing_ppgtt_mapping = 1;
 	}
@@ -450,7 +451,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 
 	if (entry->flags & EXEC_OBJECT_NEEDS_GTT &&
 	    !obj->has_global_gtt_mapping)
-		i915_gem_gtt_bind_object(obj, obj->cache_level);
+		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 
 	return 0;
 }
@@ -778,6 +779,8 @@ i915_gem_execbuffer_move_to_active(struct list_head *objects,
 		i915_gem_object_move_to_active(obj, ring);
 		if (obj->base.write_domain) {
 			obj->dirty = 1;
+			if (obj->gfdt)
+				obj->gfdt_dirty = 1;
 			obj->last_write_seqno = intel_ring_get_seqno(ring);
 			if (obj->pin_count) /* check for potential scanout */
 				intel_mark_fb_busy(obj);
@@ -1011,7 +1014,8 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	 * hsw should have this fixed, but let's be paranoid and do it
 	 * unconditionally for now. */
 	if (flags & I915_DISPATCH_SECURE && !batch_obj->has_global_gtt_mapping)
-		i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level);
+		i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level,
+					 batch_obj->gfdt);
 
 	ret = i915_gem_execbuffer_move_to_gpu(ring, &eb->objects);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 926a1e2..3c7e48b 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -42,15 +42,20 @@ typedef uint32_t gtt_pte_t;
 #define HSW_PTE_UNCACHED		(0)
 #define GEN6_PTE_CACHE_LLC		(2 << 1)
 #define GEN6_PTE_CACHE_LLC_MLC		(3 << 1)
+#define GEN6_PTE_GFDT			(1 << 3)
 #define GEN6_PTE_ADDR_ENCODE(addr)	GEN6_GTT_ADDR_ENCODE(addr)
 
 static inline gtt_pte_t gen6_pte_encode(struct drm_device *dev,
 					dma_addr_t addr,
-					enum i915_cache_level level)
+					enum i915_cache_level level,
+					bool gfdt)
 {
 	gtt_pte_t pte = GEN6_PTE_VALID;
 	pte |= GEN6_PTE_ADDR_ENCODE(addr);
 
+	if (gfdt && HAS_GFDT(dev))
+		pte |= GEN6_PTE_GFDT;
+
 	switch (level) {
 	case I915_CACHE_LLC_MLC:
 		/* Haswell doesn't set L3 this way */
@@ -89,7 +94,7 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
 
 	scratch_pte = gen6_pte_encode(ppgtt->dev,
 				      ppgtt->scratch_page_dma_addr,
-				      I915_CACHE_LLC);
+				      I915_CACHE_LLC, false);
 
 	while (num_entries) {
 		last_pte = first_pte + num_entries;
@@ -112,7 +117,8 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
 static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
 				      struct sg_table *pages,
 				      unsigned first_entry,
-				      enum i915_cache_level cache_level)
+				      enum i915_cache_level cache_level,
+				      bool gfdt)
 {
 	gtt_pte_t *pt_vaddr;
 	unsigned act_pd = first_entry / I915_PPGTT_PT_ENTRIES;
@@ -133,7 +139,7 @@ static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
 		for (j = first_pte; j < I915_PPGTT_PT_ENTRIES; j++) {
 			page_addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
 			pt_vaddr[j] = gen6_pte_encode(ppgtt->dev, page_addr,
-						      cache_level);
+						      cache_level, gfdt);
 
 			/* grab the next page */
 			if (++m == segment_len) {
@@ -279,11 +285,12 @@ void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev)
 
 void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
 			    struct drm_i915_gem_object *obj,
-			    enum i915_cache_level cache_level)
+			    enum i915_cache_level cache_level,
+			    bool gfdt)
 {
 	ppgtt->insert_entries(ppgtt, obj->pages,
 			      obj->gtt_space->start >> PAGE_SHIFT,
-			      cache_level);
+			      cache_level, gfdt);
 }
 
 void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
@@ -401,7 +408,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
 
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
 		i915_gem_clflush_object(obj);
-		i915_gem_gtt_bind_object(obj, obj->cache_level);
+		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 	}
 
 	i915_gem_chipset_flush(dev);
@@ -429,7 +436,8 @@ int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
 static void gen6_ggtt_insert_entries(struct drm_device *dev,
 				     struct sg_table *st,
 				     unsigned int first_entry,
-				     enum i915_cache_level level)
+				     enum i915_cache_level level,
+				     bool gfdt)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct scatterlist *sg = st->sgl;
@@ -443,7 +451,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
 		len = sg_dma_len(sg) >> PAGE_SHIFT;
 		for (m = 0; m < len; m++) {
 			addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
-			iowrite32(gen6_pte_encode(dev, addr, level),
+			iowrite32(gen6_pte_encode(dev, addr, level, gfdt),
 				  &gtt_entries[i]);
 			i++;
 		}
@@ -457,7 +465,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
 	 */
 	if (i != 0)
 		WARN_ON(readl(&gtt_entries[i-1])
-			!= gen6_pte_encode(dev, addr, level));
+			!= gen6_pte_encode(dev, addr, level, gfdt));
 
 	/* This next bit makes the above posting read even more important. We
 	 * want to flush the TLBs only after we're certain all the PTE updates
@@ -483,7 +491,7 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
 		num_entries = max_entries;
 
 	scratch_pte = gen6_pte_encode(dev, dev_priv->gtt.scratch_page_dma,
-				      I915_CACHE_LLC);
+				      I915_CACHE_LLC, false);
 	for (i = 0; i < num_entries; i++)
 		iowrite32(scratch_pte, &gtt_base[i]);
 	readl(gtt_base);
@@ -493,7 +501,8 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
 static void i915_ggtt_insert_entries(struct drm_device *dev,
 				     struct sg_table *st,
 				     unsigned int pg_start,
-				     enum i915_cache_level cache_level)
+				     enum i915_cache_level cache_level,
+				     bool gfdt)
 {
 	unsigned int flags = (cache_level == I915_CACHE_NONE) ?
 		AGP_USER_MEMORY : AGP_USER_CACHED_MEMORY;
@@ -511,14 +520,15 @@ static void i915_ggtt_clear_range(struct drm_device *dev,
 
 
 void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
-			      enum i915_cache_level cache_level)
+			      enum i915_cache_level cache_level,
+			      bool gfdt)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	dev_priv->gtt.gtt_insert_entries(dev, obj->pages,
 					 obj->gtt_space->start >> PAGE_SHIFT,
-					 cache_level);
+					 cache_level, gfdt);
 
 	obj->has_global_gtt_mapping = 1;
 }
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 4cf3ece..6675af2 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -242,6 +242,7 @@
 #define MI_FLUSH_DW		MI_INSTR(0x26, 1) /* for GEN6 */
 #define   MI_FLUSH_DW_STORE_INDEX	(1<<21)
 #define   MI_INVALIDATE_TLB		(1<<18)
+#define   MI_SYNCHRONIZE_GFDT		(1<<17)
 #define   MI_FLUSH_DW_OP_STOREDW	(1<<14)
 #define   MI_INVALIDATE_BSD		(1<<7)
 #define   MI_FLUSH_DW_USE_GTT		(1<<2)
@@ -311,6 +312,7 @@
 #define   PIPE_CONTROL_GLOBAL_GTT_IVB			(1<<24) /* gen7+ */
 #define   PIPE_CONTROL_CS_STALL				(1<<20)
 #define   PIPE_CONTROL_TLB_INVALIDATE			(1<<18)
+#define   PIPE_CONTROL_SYNCHRONIZE_GFDT			(1<<17)
 #define   PIPE_CONTROL_QW_WRITE				(1<<14)
 #define   PIPE_CONTROL_DEPTH_STALL			(1<<13)
 #define   PIPE_CONTROL_WRITE_FLUSH			(1<<12)
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 3db4a68..ae2c7b5 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -252,14 +252,15 @@ TRACE_EVENT(i915_gem_ring_dispatch,
 );
 
 TRACE_EVENT(i915_gem_ring_flush,
-	    TP_PROTO(struct intel_ring_buffer *ring, u32 invalidate, u32 flush),
-	    TP_ARGS(ring, invalidate, flush),
+	    TP_PROTO(struct intel_ring_buffer *ring, u32 invalidate, u32 flush, u32 internal),
+	    TP_ARGS(ring, invalidate, flush, internal),
 
 	    TP_STRUCT__entry(
 			     __field(u32, dev)
 			     __field(u32, ring)
 			     __field(u32, invalidate)
 			     __field(u32, flush)
+			     __field(u32, internal)
 			     ),
 
 	    TP_fast_assign(
@@ -267,11 +268,12 @@ TRACE_EVENT(i915_gem_ring_flush,
 			   __entry->ring = ring->id;
 			   __entry->invalidate = invalidate;
 			   __entry->flush = flush;
+			   __entry->internal = internal;
 			   ),
 
-	    TP_printk("dev=%u, ring=%x, invalidate=%04x, flush=%04x",
+	    TP_printk("dev=%u, ring=%x, invalidate=%04x, flush=%04x, internal=%08x",
 		      __entry->dev, __entry->ring,
-		      __entry->invalidate, __entry->flush)
+		      __entry->invalidate, __entry->flush, __entry->internal)
 );
 
 DECLARE_EVENT_CLASS(i915_gem_request,
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 1d5d613..9b1ae4d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -54,7 +54,8 @@ static inline int ring_space(struct intel_ring_buffer *ring)
 static int
 gen2_render_ring_flush(struct intel_ring_buffer *ring,
 		       u32	invalidate_domains,
-		       u32	flush_domains)
+		       u32	flush_domains,
+		       u32	internal)
 {
 	u32 cmd;
 	int ret;
@@ -80,7 +81,8 @@ gen2_render_ring_flush(struct intel_ring_buffer *ring,
 static int
 gen4_render_ring_flush(struct intel_ring_buffer *ring,
 		       u32	invalidate_domains,
-		       u32	flush_domains)
+		       u32	flush_domains,
+		       u32	internal)
 {
 	struct drm_device *dev = ring->dev;
 	u32 cmd;
@@ -210,7 +212,9 @@ intel_emit_post_sync_nonzero_flush(struct intel_ring_buffer *ring)
 
 static int
 gen6_render_ring_flush(struct intel_ring_buffer *ring,
-                         u32 invalidate_domains, u32 flush_domains)
+		       u32 invalidate_domains,
+		       u32 flush_domains,
+		       u32 internal)
 {
 	u32 flags = 0;
 	struct pipe_control *pc = ring->private;
@@ -248,6 +252,12 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 	}
 
+	/* Flush GFDT out to memory */
+	if (internal & I915_FLUSH_GFDT) {
+		flags |= PIPE_CONTROL_QW_WRITE;
+		flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
+	}
+
 	ret = intel_ring_begin(ring, 4);
 	if (ret)
 		return ret;
@@ -282,7 +292,9 @@ gen7_render_ring_cs_stall_wa(struct intel_ring_buffer *ring)
 
 static int
 gen7_render_ring_flush(struct intel_ring_buffer *ring,
-		       u32 invalidate_domains, u32 flush_domains)
+		       u32 invalidate_domains,
+		       u32 flush_domains,
+		       u32 internal)
 {
 	u32 flags = 0;
 	struct pipe_control *pc = ring->private;
@@ -325,6 +337,12 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 		 * invalidate bit set. */
 		gen7_render_ring_cs_stall_wa(ring);
 	}
+	/* Flush GFDT out to memory */
+	if (internal & I915_FLUSH_GFDT) {
+		flags |= PIPE_CONTROL_QW_WRITE;
+		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
+		flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
+	}
 
 	ret = intel_ring_begin(ring, 4);
 	if (ret)
@@ -912,8 +930,9 @@ void intel_ring_setup_status_page(struct intel_ring_buffer *ring)
 
 static int
 bsd_ring_flush(struct intel_ring_buffer *ring,
-	       u32     invalidate_domains,
-	       u32     flush_domains)
+	       u32	invalidate_domains,
+	       u32	flush_domains,
+	       u32	internal)
 {
 	int ret;
 
@@ -1547,7 +1566,9 @@ static void gen6_bsd_ring_write_tail(struct intel_ring_buffer *ring,
 }
 
 static int gen6_ring_flush(struct intel_ring_buffer *ring,
-			   u32 invalidate, u32 flush)
+			   u32 invalidate,
+			   u32 flush,
+			   u32 internal)
 {
 	uint32_t cmd;
 	int ret;
@@ -1557,6 +1578,12 @@ static int gen6_ring_flush(struct intel_ring_buffer *ring,
 		return ret;
 
 	cmd = MI_FLUSH_DW;
+
+	/* Flush GFDT out to memory */
+	if (internal & I915_FLUSH_GFDT)
+		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
+			MI_SYNCHRONIZE_GFDT;
+
 	/*
 	 * Bspec vol 1c.5 - video engine command streamer:
 	 * "If ENABLED, all TLBs will be invalidated once the flush
@@ -1619,7 +1646,9 @@ gen6_ring_dispatch_execbuffer(struct intel_ring_buffer *ring,
 /* Blitter support (SandyBridge+) */
 
 static int blt_ring_flush(struct intel_ring_buffer *ring,
-			  u32 invalidate, u32 flush)
+			  u32 invalidate,
+			  u32 flush,
+			  u32 internal)
 {
 	uint32_t cmd;
 	int ret;
@@ -1629,6 +1658,12 @@ static int blt_ring_flush(struct intel_ring_buffer *ring,
 		return ret;
 
 	cmd = MI_FLUSH_DW;
+
+	/* Flush GFDT out to memory */
+	if (internal & I915_FLUSH_GFDT)
+		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
+			MI_SYNCHRONIZE_GFDT;
+
 	/*
 	 * Bspec vol 1c.3 - blitter engine command streamer:
 	 * "If ENABLED, all TLBs will be invalidated once the flush
@@ -1889,11 +1924,31 @@ intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
 	if (!ring->gpu_caches_dirty)
 		return 0;
 
-	ret = ring->flush(ring, 0, I915_GEM_GPU_DOMAINS);
+	ret = ring->flush(ring, 0, I915_GEM_GPU_DOMAINS, 0);
+	if (ret)
+		return ret;
+
+	trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS, 0);
+
+	ring->gpu_caches_dirty = false;
+	return 0;
+}
+
+int
+intel_ring_flush_internal(struct intel_ring_buffer *ring, u32 internal)
+{
+	uint32_t flush_domains;
+	int ret;
+
+	flush_domains = 0;
+	if (ring->gpu_caches_dirty)
+		flush_domains = I915_GEM_GPU_DOMAINS;
+
+	ret = ring->flush(ring, 0, flush_domains, internal);
 	if (ret)
 		return ret;
 
-	trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS);
+	trace_i915_gem_ring_flush(ring, 0, flush_domains, internal);
 
 	ring->gpu_caches_dirty = false;
 	return 0;
@@ -1909,11 +1964,11 @@ intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring)
 	if (ring->gpu_caches_dirty)
 		flush_domains = I915_GEM_GPU_DOMAINS;
 
-	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, flush_domains, 0);
 	if (ret)
 		return ret;
 
-	trace_i915_gem_ring_flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+	trace_i915_gem_ring_flush(ring, I915_GEM_GPU_DOMAINS, flush_domains, 0);
 
 	ring->gpu_caches_dirty = false;
 	return 0;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d66208c..b9aa76e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -80,7 +80,8 @@ struct  intel_ring_buffer {
 				      u32 value);
 	int __must_check (*flush)(struct intel_ring_buffer *ring,
 				  u32	invalidate_domains,
-				  u32	flush_domains);
+				  u32	flush_domains,
+				  u32	internal);
 	int		(*add_request)(struct intel_ring_buffer *ring);
 	/* Some chipsets are not quite as coherent as advertised and need
 	 * an expensive kick to force a true read of the up-to-date seqno.
@@ -220,6 +221,8 @@ int __must_check intel_ring_idle(struct intel_ring_buffer *ring);
 void intel_ring_init_seqno(struct intel_ring_buffer *ring, u32 seqno);
 int intel_ring_flush_all_caches(struct intel_ring_buffer *ring);
 int intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring);
+int intel_ring_flush_internal(struct intel_ring_buffer *ring, u32 internal);
+#define I915_FLUSH_GFDT 0x1
 
 int intel_init_render_ring_buffer(struct drm_device *dev);
 int intel_init_bsd_ring_buffer(struct drm_device *dev);
-- 
1.7.10.4




More information about the Intel-gfx mailing list