[Intel-gfx] [PATCH v3 03/23] drm/i915: Micro-optimise gen8_ppgtt_insert_entries()
Mika Kuoppala
mika.kuoppala at linux.intel.com
Tue Feb 14 16:00:55 UTC 2017
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Improve the sg iteration and in the process eliminate a bug in the
> computation of the pml4 length: orig_nents << PAGE_SHIFT is no longer
> the full length of the sg table.
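A note for context: i915 now coalesces contiguous pages into single sg
entries, so the entry count no longer translates into a byte length. A
minimal sketch of the distinction (hypothetical helper, not part of the
patch):

#include <linux/scatterlist.h>

static u64 sgt_total_length(struct sg_table *st)
{
	struct scatterlist *sg;
	unsigned int i;
	u64 len = 0;

	/* Each coalesced entry may span several pages... */
	for_each_sg(st->sgl, sg, st->orig_nents, i)
		len += sg->length;

	/* ...so in general len != st->orig_nents << PAGE_SHIFT. */
	return len;
}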
>
> v2: Check for the end of the fourth level page table (the final pdpe)
> and move on to the next.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem_gtt.c | 170 +++++++++++++++++++-----------------
> 1 file changed, 91 insertions(+), 79 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index ca1f5fa6984f..ddfb5963f521 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -751,9 +751,9 @@ static bool gen8_ppgtt_clear_pt(struct i915_address_space *vm,
> unsigned int num_entries = gen8_pte_count(start, length);
> unsigned int pte = gen8_pte_index(start);
> unsigned int pte_end = pte + num_entries;
> - gen8_pte_t *pt_vaddr;
> - gen8_pte_t scratch_pte = gen8_pte_encode(vm->scratch_page.daddr,
> - I915_CACHE_LLC);
> + const gen8_pte_t scratch_pte =
> + gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC);
> + gen8_pte_t *vaddr;
>
> if (WARN_ON(!px_page(pt)))
> return false;
> @@ -766,12 +766,10 @@ static bool gen8_ppgtt_clear_pt(struct i915_address_space *vm,
> return true;
> }
>
> - pt_vaddr = kmap_px(pt);
> -
> + vaddr = kmap_px(pt);
> while (pte < pte_end)
> - pt_vaddr[pte++] = scratch_pte;
> -
> - kunmap_px(ppgtt, pt_vaddr);
> + vaddr[pte++] = scratch_pte;
> + kunmap_px(ppgtt, vaddr);
>
> return false;
> }
> @@ -879,71 +877,98 @@ static void gen8_ppgtt_clear_range(struct i915_address_space *vm,
> gen8_ppgtt_clear_pdp(vm, &ppgtt->pdp, start, length);
> }
>
> -static void
> -gen8_ppgtt_insert_pte_entries(struct i915_address_space *vm,
> +struct sgt_dma {
> + struct scatterlist *sg;
> + dma_addr_t dma, max;
> +};
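Side note for the reader: sgt_dma is a resumable cursor over the
dma-mapped list, carried across calls. A standalone sketch of the
advance step (hypothetical helper, mirroring the loop further down):

static bool sgt_dma_advance(struct sgt_dma *iter)
{
	iter->dma += PAGE_SIZE;
	if (iter->dma < iter->max)
		return true;

	iter->sg = __sg_next(iter->sg);
	if (!iter->sg)
		return false;

	iter->dma = sg_dma_address(iter->sg);
	iter->max = iter->dma + iter->sg->length;
	return true;
}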
> +
> +static __always_inline bool
> +gen8_ppgtt_insert_pte_entries(struct i915_hw_ppgtt *ppgtt,
> struct i915_page_directory_pointer *pdp,
> - struct sg_page_iter *sg_iter,
> - uint64_t start,
> + struct sgt_dma *iter,
> + u64 start,
> enum i915_cache_level cache_level)
> {
> - struct i915_hw_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);
> - gen8_pte_t *pt_vaddr;
> - unsigned pdpe = gen8_pdpe_index(start);
> - unsigned pde = gen8_pde_index(start);
> - unsigned pte = gen8_pte_index(start);
> + unsigned int pdpe = gen8_pdpe_index(start);
> + unsigned int pde = gen8_pde_index(start);
> + unsigned int pte = gen8_pte_index(start);
> + struct i915_page_directory *pd;
> + const gen8_pte_t pte_encode = gen8_pte_encode(0, cache_level);
> + gen8_pte_t *vaddr;
> + bool ret;
>
> - pt_vaddr = NULL;
> + pd = pdp->page_directory[pdpe];
> + vaddr = kmap_px(pd->page_table[pde]);
> + do {
> + vaddr[pte] = pte_encode | iter->dma;
> + iter->dma += PAGE_SIZE;
> + if (iter->dma >= iter->max) {
> + iter->sg = __sg_next(iter->sg);
> + if (!iter->sg) {
> + ret = false;
> + break;
> + }
>
> - while (__sg_page_iter_next(sg_iter)) {
> - if (pt_vaddr == NULL) {
> - struct i915_page_directory *pd = pdp->page_directory[pdpe];
> - struct i915_page_table *pt = pd->page_table[pde];
> - pt_vaddr = kmap_px(pt);
> + iter->dma = sg_dma_address(iter->sg);
> + iter->max = iter->dma + iter->sg->length;
> }
>
> - pt_vaddr[pte] =
> - gen8_pte_encode(sg_page_iter_dma_address(sg_iter),
> - cache_level);
> if (++pte == GEN8_PTES) {
> - kunmap_px(ppgtt, pt_vaddr);
> - pt_vaddr = NULL;
> if (++pde == I915_PDES) {
> - if (++pdpe == I915_PDPES_PER_PDP(vm->i915))
> + if (++pdpe == GEN8_PML4ES_PER_PML4) {
For the next reader, you could add a comment noting that we run out of
scatterlist elements on 32bit ppgtt (pages->sgl) before reaching here,
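e.g. something along the lines of (one possible wording):

	if (++pdpe == GEN8_PML4ES_PER_PML4) {
		/*
		 * 32b ppgtt exhausts its scatterlist
		 * (pages->sgl) long before pdpe can
		 * overflow here.
		 */
		ret = true;
		break;
	}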
Reviewed-by: Mika Kuoppala <mika.kuoppala at intel.com>
-Mika
> + ret = true;
> break;
> + }
> +
> + pd = pdp->page_directory[pdpe];
> pde = 0;
> }
> +
> + kunmap_px(ppgtt, vaddr);
> + vaddr = kmap_px(pd->page_table[pde]);
> pte = 0;
> }
> - }
> + } while (1);
> + kunmap_px(ppgtt, vaddr);
>
> - if (pt_vaddr)
> - kunmap_px(ppgtt, pt_vaddr);
> + return ret;
> }
>
> -static void gen8_ppgtt_insert_entries(struct i915_address_space *vm,
> - struct sg_table *pages,
> - uint64_t start,
> - enum i915_cache_level cache_level,
> - u32 unused)
> +static void gen8_ppgtt_insert_3lvl(struct i915_address_space *vm,
> + struct sg_table *pages,
> + u64 start,
> + enum i915_cache_level cache_level,
> + u32 unused)
> {
> struct i915_hw_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);
> - struct sg_page_iter sg_iter;
> + struct sgt_dma iter = {
> + .sg = pages->sgl,
> + .dma = sg_dma_address(iter.sg),
> + .max = iter.dma + iter.sg->length,
> + };
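Aside: the initializer reads iter.sg while iter is still being set up.
gcc evaluates designated initializers in order, but the C standard
leaves that order unspecified, so a small helper would make it
explicit; a sketch with a hypothetical name:

static struct sgt_dma sgt_dma_init(struct sg_table *pages)
{
	struct scatterlist *sg = pages->sgl;

	return (struct sgt_dma) {
		.sg = sg,
		.dma = sg_dma_address(sg),
		.max = sg_dma_address(sg) + sg->length,
	};
}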
>
> - __sg_page_iter_start(&sg_iter, pages->sgl, sg_nents(pages->sgl), 0);
> + gen8_ppgtt_insert_pte_entries(ppgtt, &ppgtt->pdp, &iter,
> + start, cache_level);
> +}
>
> - if (!USES_FULL_48BIT_PPGTT(vm->i915)) {
> - gen8_ppgtt_insert_pte_entries(vm, &ppgtt->pdp, &sg_iter, start,
> - cache_level);
> - } else {
> - struct i915_page_directory_pointer *pdp;
> - uint64_t pml4e;
> - uint64_t length = (uint64_t)pages->orig_nents << PAGE_SHIFT;
> +static void gen8_ppgtt_insert_4lvl(struct i915_address_space *vm,
> + struct sg_table *pages,
> + uint64_t start,
> + enum i915_cache_level cache_level,
> + u32 unused)
> +{
> + struct i915_hw_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);
> + struct sgt_dma iter = {
> + .sg = pages->sgl,
> + .dma = sg_dma_address(iter.sg),
> + .max = iter.dma + iter.sg->length,
> + };
> + struct i915_page_directory_pointer **pdps = ppgtt->pml4.pdps;
> + unsigned int pml4e = gen8_pml4e_index(start);
>
> - gen8_for_each_pml4e(pdp, &ppgtt->pml4, start, length, pml4e) {
> - gen8_ppgtt_insert_pte_entries(vm, pdp, &sg_iter,
> - start, cache_level);
> - }
> - }
> + while (gen8_ppgtt_insert_pte_entries(ppgtt, pdps[pml4e++], &iter,
> + start, cache_level))
> + ;
> }
>
> static void gen8_free_page_tables(struct drm_i915_private *dev_priv,
> @@ -1525,8 +1550,8 @@ static void gen8_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
> struct i915_address_space *vm = &ppgtt->base;
> uint64_t start = ppgtt->base.start;
> uint64_t length = ppgtt->base.total;
> - gen8_pte_t scratch_pte = gen8_pte_encode(vm->scratch_page.daddr,
> - I915_CACHE_LLC);
> + const gen8_pte_t scratch_pte =
> + gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC);
>
> if (!USES_FULL_48BIT_PPGTT(vm->i915)) {
> gen8_dump_pdp(&ppgtt->pdp, start, length, scratch_pte, m);
> @@ -1591,7 +1616,6 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
> ppgtt->base.start = 0;
> ppgtt->base.cleanup = gen8_ppgtt_cleanup;
> ppgtt->base.allocate_va_range = gen8_alloc_va_range;
> - ppgtt->base.insert_entries = gen8_ppgtt_insert_entries;
> ppgtt->base.clear_range = gen8_ppgtt_clear_range;
> ppgtt->base.unbind_vma = ppgtt_unbind_vma;
> ppgtt->base.bind_vma = ppgtt_bind_vma;
> @@ -1606,6 +1630,8 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>
> ppgtt->base.total = 1ULL << 48;
> ppgtt->switch_mm = gen8_48b_mm_switch;
> +
> + ppgtt->base.insert_entries = gen8_ppgtt_insert_4lvl;
> } else {
> ret = __pdp_init(dev_priv, &ppgtt->pdp);
> if (ret)
> @@ -1622,6 +1648,8 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
> if (ret)
> goto free_scratch;
> }
> +
> + ppgtt->base.insert_entries = gen8_ppgtt_insert_3lvl;
> }
>
> if (intel_vgpu_active(dev_priv))
> @@ -1888,11 +1916,6 @@ static void gen6_ppgtt_clear_range(struct i915_address_space *vm,
> }
> }
>
> -struct sgt_dma {
> - struct scatterlist *sg;
> - dma_addr_t dma, max;
> -};
> -
> static void gen6_ppgtt_insert_entries(struct i915_address_space *vm,
> struct sg_table *pages,
> uint64_t start,
> @@ -2434,26 +2457,15 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm,
> struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
> struct sgt_iter sgt_iter;
> gen8_pte_t __iomem *gtt_entries;
> - gen8_pte_t gtt_entry;
> + const gen8_pte_t pte_encode = gen8_pte_encode(0, level);
> dma_addr_t addr;
> - int i = 0;
>
> - gtt_entries = (gen8_pte_t __iomem *)ggtt->gsm + (start >> PAGE_SHIFT);
> + gtt_entries = (gen8_pte_t __iomem *)ggtt->gsm;
> + gtt_entries += start >> PAGE_SHIFT;
> + for_each_sgt_dma(addr, sgt_iter, st)
> + gen8_set_pte(gtt_entries++, pte_encode | addr);
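Note on the hoist: pulling gen8_pte_encode() out of the loop assumes
the encoding is linear in the address, i.e. encode(addr, level) ==
encode(0, level) | addr, which holds since the flag bits and the
page-aligned address bits are disjoint. If wanted, a (hypothetical)
self-check inside the loop:

	for_each_sgt_dma(addr, sgt_iter, st) {
		GEM_BUG_ON((pte_encode | addr) != gen8_pte_encode(addr, level));
		gen8_set_pte(gtt_entries++, pte_encode | addr);
	}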
>
> - for_each_sgt_dma(addr, sgt_iter, st) {
> - gtt_entry = gen8_pte_encode(addr, level);
> - gen8_set_pte(&gtt_entries[i++], gtt_entry);
> - }
> -
> - /*
> - * XXX: This serves as a posting read to make sure that the PTE has
> - * actually been updated. There is some concern that even though
> - * registers and PTEs are within the same BAR that they are potentially
> - * of NUMA access patterns. Therefore, even with the way we assume
> - * hardware should work, we must keep this posting read for paranoia.
> - */
> - if (i != 0)
> - WARN_ON(readq(&gtt_entries[i-1]) != gtt_entry);
> + wmb();
>
> /* This next bit makes the above posting read even more important. We
> * want to flush the TLBs only after we're certain all the PTE updates
> @@ -2541,7 +2553,9 @@ static void gen8_ggtt_clear_range(struct i915_address_space *vm,
> struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
> unsigned first_entry = start >> PAGE_SHIFT;
> unsigned num_entries = length >> PAGE_SHIFT;
> - gen8_pte_t scratch_pte, __iomem *gtt_base =
> + const gen8_pte_t scratch_pte =
> + gen8_pte_encode(vm->scratch_page.daddr, I915_CACHE_LLC);
> + gen8_pte_t __iomem *gtt_base =
> (gen8_pte_t __iomem *)ggtt->gsm + first_entry;
> const int max_entries = ggtt_total_entries(ggtt) - first_entry;
> int i;
> @@ -2551,8 +2565,6 @@ static void gen8_ggtt_clear_range(struct i915_address_space *vm,
> first_entry, num_entries, max_entries))
> num_entries = max_entries;
>
> - scratch_pte = gen8_pte_encode(vm->scratch_page.daddr,
> - I915_CACHE_LLC);
> for (i = 0; i < num_entries; i++)
> gen8_set_pte(&gtt_base[i], scratch_pte);
> readl(gtt_base);
> --
> 2.11.0