[PATCH 66/78] drm/i915: Micro-optimise gen6_ppgtt_insert_entries()

Sun Jan 1 13:33:55 UTC 2017

Inline the address computation to avoid the vfunc call for every page.
We still have to pay the high overhead of sg_page_iter_next(), but now
at least GCC can optimise the inner most loop, giving a significant
boost to some thrashing Unreal Engine workloads.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 47 ++++++++++---------------------------
 1 file changed, 13 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index bf4c247b697d..c4178c83181b 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1880,28 +1880,22 @@ static void gen6_ppgtt_insert_entries(struct i915_address_space *vm,
 	struct i915_hw_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);
 	unsigned first_entry = start >> PAGE_SHIFT;
 	unsigned act_pt = first_entry / GEN6_PTES;
-	unsigned act_pte = first_entry % GEN6_PTES;
-	gen6_pte_t *pt_vaddr = NULL;
+	unsigned act_pte = first_entry % GEN6_PTES - 1;
+	u32 pte_encode = vm->pte_encode(0, cache_level, flags);
 	struct sgt_iter sgt_iter;
+	gen6_pte_t *pt_vaddr;
 	dma_addr_t addr;
 
+	pt_vaddr = kmap_px(ppgtt->pd.page_table[act_pt]);
 	for_each_sgt_dma(addr, sgt_iter, pages) {
-		if (pt_vaddr == NULL)
-			pt_vaddr = kmap_px(ppgtt->pd.page_table[act_pt]);
-
-		pt_vaddr[act_pte] =
-			vm->pte_encode(addr, cache_level, flags);
-
 		if (++act_pte == GEN6_PTES) {
 			kunmap_px(ppgtt, pt_vaddr);
-			pt_vaddr = NULL;
-			act_pt++;
+			pt_vaddr = kmap_px(ppgtt->pd.page_table[++act_pt]);
 			act_pte = 0;
 		}
+		pt_vaddr[act_pte] = pte_encode | GEN6_PTE_ADDR_ENCODE(addr);
 	}
-
-	if (pt_vaddr)
-		kunmap_px(ppgtt, pt_vaddr);
+	kunmap_px(ppgtt, pt_vaddr);
 }
 
 static int gen6_alloc_va_range(struct i915_address_space *vm,
@@ -2500,34 +2494,19 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm,
 {
 	struct drm_i915_private *dev_priv = vm->i915;
 	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
-	struct sgt_iter sgt_iter;
-	gen6_pte_t __iomem *gtt_entries;
-	gen6_pte_t gtt_entry;
+	gen6_pte_t __iomem *entries = (gen6_pte_t __iomem *)ggtt->gsm;
+	unsigned i = start >> PAGE_SHIFT;
+	struct sgt_iter iter;
 	dma_addr_t addr;
-	int i = 0;
-
-	gtt_entries = (gen6_pte_t __iomem *)ggtt->gsm + (start >> PAGE_SHIFT);
-
-	for_each_sgt_dma(addr, sgt_iter, st) {
-		gtt_entry = vm->pte_encode(addr, level, flags);
-		iowrite32(gtt_entry, &gtt_entries[i++]);
-	}
-
-	/* XXX: This serves as a posting read to make sure that the PTE has
-	 * actually been updated. There is some concern that even though
-	 * registers and PTEs are within the same BAR that they are potentially
-	 * of NUMA access patterns. Therefore, even with the way we assume
-	 * hardware should work, we must keep this posting read for paranoia.
-	 */
-	if (i != 0)
-		WARN_ON(readl(&gtt_entries[i-1]) != gtt_entry);
+	for_each_sgt_dma(addr, iter, st)
+		iowrite32(vm->pte_encode(addr, level, flags), &entries[i++]);
+	wmb();
 
 	/* This next bit makes the above posting read even more important. We
 	 * want to flush the TLBs only after we're certain all the PTE updates
 	 * have finished.
 	 */
 	I915_WRITE(GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN);
-	POSTING_READ(GFX_FLSH_CNTL_GEN6);
 }
 
 static void nop_clear_range(struct i915_address_space *vm,
-- 
2.11.0