[PATCH 095/131] drm/i915: Micro-optimise gen6_ppgtt_insert_entries()

Chris Wilson chris at chris-wilson.co.uk
Sat Aug 6 07:37:02 UTC 2016


Inline the address computation to avoid the vfunc call for every page.
We still have to pay the high overhead of sg_page_iter_next(), but now
at least GCC can optimise the innermost loop, giving a significant
boost to some thrashing Unreal Engine workloads.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index fb543941578f..dc5295497f9d 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1815,28 +1815,22 @@ static void gen6_ppgtt_insert_entries(struct i915_address_space *vm,
 	struct i915_hw_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);
 	unsigned first_entry = start >> PAGE_SHIFT;
 	unsigned act_pt = first_entry / GEN6_PTES;
-	unsigned act_pte = first_entry % GEN6_PTES;
-	gen6_pte_t *pt_vaddr = NULL;
+	unsigned act_pte = first_entry % GEN6_PTES - 1;
+	u32 pte_encode = vm->pte_encode(0, cache_level, true, flags);
 	struct sgt_iter sgt_iter;
+	gen6_pte_t *pt_vaddr;
 	dma_addr_t addr;
 
+	pt_vaddr = kmap_px(ppgtt->pd.page_table[act_pt]);
 	for_each_sgt_dma(addr, sgt_iter, pages) {
-		if (pt_vaddr == NULL)
-			pt_vaddr = kmap_px(ppgtt->pd.page_table[act_pt]);
-
-		pt_vaddr[act_pte] =
-			vm->pte_encode(addr, cache_level, true, flags);
-
 		if (++act_pte == GEN6_PTES) {
 			kunmap_px(ppgtt, pt_vaddr);
-			pt_vaddr = NULL;
-			act_pt++;
+			pt_vaddr = kmap_px(ppgtt->pd.page_table[++act_pt]);
 			act_pte = 0;
 		}
+		pt_vaddr[act_pte] = pte_encode | GEN6_PTE_ADDR_ENCODE(addr);
 	}
-
-	if (pt_vaddr)
-		kunmap_px(ppgtt, pt_vaddr);
+	kunmap_px(ppgtt, pt_vaddr);
 }
 
 static int gen6_alloc_va_range(struct i915_address_space *vm,
-- 
2.8.1



More information about the Intel-gfx-trybot mailing list