[Intel-gfx] [PATCH 3/4] agp/intel-gtt: check cache-coherency on i830 class chipsets

Daniel Vetter daniel.vetter at ffwll.ch
Sun May 9 13:41:25 CEST 2010


To quote the intel documentation (for the i815, but judging by the code
the gtt works the same on all i8xx chipsets):

"Note that the 4KB page of physical main memory (that have been mapped
to the graphics aperture through the GTT) must be accesssed strictly
through the aperture. The GMCH does not guarentee data coherency if any
of these pages are accessed directly using their physical addresses.

...

There is _no_ hardware support for ensuring coherency between these
two access paths."

In other words: We're trying to accomplish something explicitly verboten.

Therefore be paranoid and check coherency on every chipset flush.
This check reveals that the current code does not work at all. On my
i855GM, cache coherency for cpu-to-gtt transfers fails about 1% of the
time; the other direction is even worse and fails roughly every second
flush under load.

This check uses the last page of the gtt, which was reserved by the
previous patch.
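
In code, the check boils down to roughly the following dance. This is a
simplified sketch only: the rotating offsets, memory barriers,
ratelimiting and error handling of the actual patch are stripped out,
and cpu_addr/gtt_addr/num/do_chipset_flush() are placeholder names for
the cpu and aperture mappings of the reserved page, a per-flush
sequence number and the flush itself:

	/* write a marker once through each access path */
	*cpu_addr = num;			/* cpu (cacheable) mapping */
	clflush(cpu_addr);
	writel(num, gtt_addr + 2048);		/* gtt aperture, 2nd half */

	do_chipset_flush();			/* the flush being verified */

	/* each marker must now be visible through the _other_ path */
	WARN_ON(readl(gtt_addr) != num);	/* cpu -> gtt */
	WARN_ON(*(cpu_addr + 2048/4) != num);	/* gtt -> cpu */

If either WARN_ON fires, the flush failed to make writes from one side
visible to the other.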

Signed-off-by: Daniel Vetter <daniel.vetter at ffwll.ch>
---
 drivers/char/agp/intel-gtt.c |  140 ++++++++++++++++++++++++++++++++++-------
 include/drm/intel-gtt.h      |    6 +-
 2 files changed, 120 insertions(+), 26 deletions(-)
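
A note on the ratelimiting used below: DEFINE_RATELIMIT_STATE and
__ratelimit() are the stock helpers from <linux/ratelimit.h>;
__ratelimit() returns non-zero at most 'burst' times per 'interval'.
So, as a hypothetical usage example (not taken from this patch),

	DEFINE_RATELIMIT_STATE(rs, 60*HZ, 1);	/* at most 1 hit per minute */

	if (error && __ratelimit(&rs))
		printk(KERN_WARNING "something is wrong\n");

warns at most once a minute, which is how the WARNs in
intel_i830_chipset_flush() are kept from spamming the log.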

diff --git a/drivers/char/agp/intel-gtt.c b/drivers/char/agp/intel-gtt.c
index eec043b..bed0ed6 100644
--- a/drivers/char/agp/intel-gtt.c
+++ b/drivers/char/agp/intel-gtt.c
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/pagemap.h>
 #include <linux/agp_backend.h>
+#include <linux/ratelimit.h>
 #include <asm/smp.h>
 #include "agp.h"
 #include "intel-agp.h"
@@ -68,11 +69,12 @@ static struct _intel_private {
 	 */
 	int gtt_entries;			/* i830+ */
 	int gtt_total_size;
-	union {
-		void __iomem *i9xx_flush_page;
-		void *i8xx_flush_page;
-	};
-	struct page *i8xx_page;
+	void __iomem *i9xx_flush_page;
+	void *i8xx_cpu_flush_page;
+	void *i8xx_cpu_check_page;
+	void __iomem *i8xx_gtt_cc_pages;
+	unsigned int i8xx_cache_flush_num;
+	struct page *i8xx_pages[I830_CC_DANCE_PAGES + 1];
 	struct resource ifp_resource;
 	int resource_valid;
 } intel_private;
@@ -736,27 +738,74 @@ static void intel_i830_init_gtt_entries(void)
 
 static void intel_i830_fini_flush(void)
 {
-	kunmap(intel_private.i8xx_page);
-	intel_private.i8xx_flush_page = NULL;
-	unmap_page_from_agp(intel_private.i8xx_page);
+	int i;
+
+	kunmap(intel_private.i8xx_pages[0]);
+	intel_private.i8xx_cpu_flush_page = NULL;
+	kunmap(intel_private.i8xx_pages[1]);
+	intel_private.i8xx_cpu_check_page = NULL;
+
+	if (intel_private.i8xx_gtt_cc_pages)
+		iounmap(intel_private.i8xx_gtt_cc_pages);
 
-	__free_page(intel_private.i8xx_page);
-	intel_private.i8xx_page = NULL;
+	for (i = 0; i < I830_CC_DANCE_PAGES + 1; i++) {
+		__free_page(intel_private.i8xx_pages[i]);
+		intel_private.i8xx_pages[i] = NULL;
+	}
 }
 
 static void intel_i830_setup_flush(void)
 {
+	int num_entries = A_SIZE_FIX(agp_bridge->current_size)->num_entries;
+	int i;
+
 	/* return if we've already set the flush mechanism up */
-	if (intel_private.i8xx_page)
-		return;
+	if (intel_private.i8xx_pages[0])
+		goto setup;
+
+	for (i = 0; i < I830_CC_DANCE_PAGES + 1; i++) {
+		intel_private.i8xx_pages[i]
+			= alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA32);
+		if (!intel_private.i8xx_pages[i]) {
+			intel_i830_fini_flush();
+			return;
+		}
+	}
 
-	intel_private.i8xx_page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA32);
-	if (!intel_private.i8xx_page)
+	intel_private.i8xx_cpu_check_page = kmap(intel_private.i8xx_pages[1]);
+	if (!intel_private.i8xx_cpu_check_page) {
+		WARN_ON(1);
+		intel_i830_fini_flush();
 		return;
-
-	intel_private.i8xx_flush_page = kmap(intel_private.i8xx_page);
-	if (!intel_private.i8xx_flush_page)
+	}
+	intel_private.i8xx_cpu_flush_page = kmap(intel_private.i8xx_pages[0]);
+	if (!intel_private.i8xx_cpu_flush_page) {
+		WARN_ON(1);
 		intel_i830_fini_flush();
+		return;
+	}
+
+	/* Map the flushing pages into the gtt as the last entries. The last
+	 * page can't be used by the gpu, anyway (prefetch might walk over the
+	 * end of the last page). */
+	intel_private.i8xx_gtt_cc_pages
+		= ioremap_wc(agp_bridge->gart_bus_addr
+				+ num_entries*4096,
+			     I830_CC_DANCE_PAGES*4096);
+
+	if (!intel_private.i8xx_gtt_cc_pages)
+		dev_info(&intel_private.pcidev->dev, "can't ioremap flush page - no chipset flushing\n");
+
+setup:
+	/* Don't map the first page into the gtt: it is only ever
+	 * written through its cpu (physical address) mapping. */
+	for (i = 0; i < I830_CC_DANCE_PAGES; i++) {
+		writel(agp_bridge->driver->mask_memory(agp_bridge,
+				page_to_phys(intel_private.i8xx_pages[i+1]), 0),
+		       intel_private.registers+I810_PTE_BASE+((num_entries+i)*4));
+	}
+
+	intel_private.i8xx_cache_flush_num = 0;
 }
 
 /* The chipset_flush interface needs to get data that has already been
@@ -769,16 +818,58 @@ static void intel_i830_setup_flush(void)
  * that buffer out, we just fill 1KB and clflush it out, on the assumption
  * that it'll push whatever was in there out.  It appears to work.
  */
-static void intel_i830_chipset_flush(struct agp_bridge_data *bridge)
-{
-	unsigned int *pg = intel_private.i8xx_flush_page;
 
-	memset(pg, 0, 1024);
+/* Complaining once a minute about cache incoherency is enough! */
+static DEFINE_RATELIMIT_STATE(i8xx_chipset_flush_ratelimit_cpu, 60*HZ, 1);
+static DEFINE_RATELIMIT_STATE(i8xx_chipset_flush_ratelimit_gtt, 60*HZ, 1);
 
-	if (cpu_has_clflush)
-		clflush_cache_range(pg, 1024);
-	else if (wbinvd_on_all_cpus() != 0)
+static void intel_i830_chipset_flush(struct agp_bridge_data *bridge)
+{
+	unsigned int offset1
+		= (intel_private.i8xx_cache_flush_num * sizeof(int)) % 4096;
+	unsigned int offset2
+		= (intel_private.i8xx_cache_flush_num * sizeof(int)
+				+ 2048) % 4096;
+	unsigned int *p_cpu_read = intel_private.i8xx_cpu_check_page + offset1;
+	unsigned int *p_cpu_write = intel_private.i8xx_cpu_check_page + offset2;
+	unsigned int gtt_read, cpu_read;
+
+	/* write check values */
+	*p_cpu_write = intel_private.i8xx_cache_flush_num;
+	mb();
+	if (cpu_has_clflush) {
+		clflush(p_cpu_write);
+		clflush(p_cpu_read);
+	} else
+		wbinvd_on_all_cpus();
+	writel(intel_private.i8xx_cache_flush_num,
+			intel_private.i8xx_gtt_cc_pages + offset1);
+	mb();
+
+	memset(intel_private.i8xx_cpu_flush_page, 0,
+	       I830_MCH_WRITE_BUFFER_SIZE);
+
+	if (cpu_has_clflush) {
+		clflush_cache_range(intel_private.i8xx_cpu_flush_page,
+				    I830_MCH_WRITE_BUFFER_SIZE);
+	} else if (wbinvd_on_all_cpus() != 0)
 		printk(KERN_ERR "Timed out waiting for cache flush.\n");
+
+	/* read check values */
+	mb();
+	gtt_read = readl(intel_private.i8xx_gtt_cc_pages + offset2);
+	cpu_read = *p_cpu_read;
+
+	WARN(cpu_read != intel_private.i8xx_cache_flush_num
+			&& __ratelimit(&i8xx_chipset_flush_ratelimit_cpu),
+		"i8xx chipset flush failed, expected: %u, cpu_read: %u\n",
+		intel_private.i8xx_cache_flush_num, cpu_read);
+	WARN(gtt_read != intel_private.i8xx_cache_flush_num
+			&& __ratelimit(&i8xx_chipset_flush_ratelimit_gtt),
+		"i8xx chipset flush failed, expected: %u, gtt_read: %u\n",
+		intel_private.i8xx_cache_flush_num, gtt_read);
+
+	intel_private.i8xx_cache_flush_num++;
 }
 
 /* The intel i830 automatically initializes the agp aperture during POST.
@@ -883,6 +974,7 @@ static int intel_i830_configure(void)
 	global_cache_flush();
 
 	intel_i830_setup_flush();
+
 	return 0;
 }
 
diff --git a/include/drm/intel-gtt.h b/include/drm/intel-gtt.h
index 6cec6d2..ab4da67 100644
--- a/include/drm/intel-gtt.h
+++ b/include/drm/intel-gtt.h
@@ -2,5 +2,7 @@
  * drm module
  */
 
-/* This denotes how many pages intel-gtt steals at the end of the gart. */
-#define I830_CC_DANCE_PAGES 1
+/* This denotes how many pages intel-gtt steals at the end of the gart:
+ * one page to check cache coherency on i830 */
+#define I830_CC_DANCE_PAGES		1
+#define I830_MCH_WRITE_BUFFER_SIZE	1024
-- 
1.7.1



