[Intel-gfx] [PATCH v2 10/24] drm/i915: Track GEN6 page table usage

Mon Jan 5 06:29:13 PST 2015

On Tue, Dec 23, 2014 at 05:16:13PM +0000, Michel Thierry wrote:
> From: Ben Widawsky <benjamin.widawsky at intel.com>
> 
> Instead of implementing the full tracking + dynamic allocation, this
> patch does a bit less than half of the work, by tracking and warning on
> unexpected conditions. The tracking itself follows which PTEs within a
> page table are currently being used for objects. The next patch will
> modify this to actually allocate the page tables only when necessary.
> 
> With the current patch there isn't much in the way of making a gen
> agnostic range allocation function. However, in the next patch we'll add
> more specificity which makes having separate functions a bit easier to
> manage.
> 
> One important change introduced here is that DMA mappings are
> created/destroyed at the same page directories/tables are
> allocated/deallocated.
> 
> Notice that aliasing PPGTT is not managed here. The patch which actually
> begins dynamic allocation/teardown explains the reasoning for this.
> 
> v2: s/pdp.pagedir/pdp.pagedirs
> Make a scratch page allocation helper
> 
> v3: Rebase and expand commit message.
> 
> v4: Allocate required pagetables only when it is needed, _bind_to_vm
> instead of bind_vma (Daniel).
> 
> Cc: Daniel Vetter <daniel at ffwll.ch>
> Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
> Signed-off-by: Michel Thierry <michel.thierry at intel.com> (v3+)

Imo still a bit too much rebase fluff in this patch. I think it would help
the patch clarity a lot if we'd split the changes to move around the
dma_map/unmap calls from the other parts of the patch to dynamically
allocate pagetables.

Bunch more comments below.
-Daniel

> ---
>  drivers/gpu/drm/i915/i915_gem.c     |   9 ++
>  drivers/gpu/drm/i915/i915_gem_gtt.c | 277 ++++++++++++++++++++++++++----------
>  drivers/gpu/drm/i915/i915_gem_gtt.h | 149 ++++++++++++++-----
>  3 files changed, 322 insertions(+), 113 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 2b6ecfd..5d52990 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3597,6 +3597,15 @@ search_free:
>  	if (ret)
>  		goto err_remove_node;
>  
> +	/*  allocate before insert / bind */
> +	if (vma->vm->allocate_va_range) {
> +		ret = vma->vm->allocate_va_range(vma->vm,
> +						vma->node.start,
> +						vma->node.size);
> +		if (ret)
> +			goto err_remove_node;
> +	}

Is this really the right patch for this hunk? The commit message sounds
like dynamic pagetable alloc is only partially implemented here ...

> +
>  	trace_i915_vma_bind(vma, flags);
>  	ret = i915_vma_bind(vma, obj->cache_level,
>  			    flags & PIN_GLOBAL ? GLOBAL_BIND : 0);
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 52bdde7..313432e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -138,10 +138,9 @@ static int sanitize_enable_ppgtt(struct drm_device *dev, int enable_ppgtt)
>  		return has_aliasing_ppgtt ? 1 : 0;
>  }
>  
> -
>  static void ppgtt_bind_vma(struct i915_vma *vma,
> -			   enum i915_cache_level cache_level,
> -			   u32 flags);
> +			  enum i915_cache_level cache_level,
> +			  u32 flags);
>  static void ppgtt_unbind_vma(struct i915_vma *vma);
>  
>  static inline gen8_gtt_pte_t gen8_pte_encode(dma_addr_t addr,
> @@ -275,27 +274,99 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
>  	return pte;
>  }
>  
> -static void free_pt_single(struct i915_pagetab *pt)
> -{
> +#define i915_dma_unmap_single(px, dev) do { \
> +	pci_unmap_page((dev)->pdev, (px)->daddr, 4096, PCI_DMA_BIDIRECTIONAL); \
> +} while (0);
> +
> +/**
> + * i915_dma_map_px_single() - Create a dma mapping for a page table/dir/etc.
> + * @px:		Page table/dir/etc to get a DMA map for
> + * @dev:	drm device
> + *
> + * Page table allocations are unified across all gens. They always require a
> + * single 4k allocation, as well as a DMA mapping. If we keep the structs
> + * symmetric here, the simple macro covers us for every page table type.
> + *
> + * Return: 0 if success.
> + */
> +#define i915_dma_map_px_single(px, dev) \
> +	pci_dma_mapping_error((dev)->pdev, \
> +			      (px)->daddr = pci_map_page((dev)->pdev, \
> +							 (px)->page, 0, 4096, \
> +							 PCI_DMA_BIDIRECTIONAL))

Linux coding style discourages macro abuse like this, please make this a
static inline instead. Otoh I don't really see the value in hiding the
pci_map_page call, imo open-coding this is totally ok.

But while you touch the code please switch away from the pci_map wrappers
and use the dma_map functions directly.

> +
> +static void __free_pt_single(struct i915_pagetab *pt, struct drm_device *dev,
> +			     int scratch)
> +{
> +	if (WARN(scratch ^ pt->scratch,
> +		 "Tried to free scratch = %d. Is scratch = %d\n",
> +		 scratch, pt->scratch))
> +		return;
> +
>  	if (WARN_ON(!pt->page))
>  		return;
> +
> +	if (!scratch) {
> +		const size_t count = INTEL_INFO(dev)->gen >= 8 ?
> +			GEN8_PTES_PER_PAGE : I915_PPGTT_PT_ENTRIES;
> +		WARN(!bitmap_empty(pt->used_ptes, count),
> +		     "Free page table with %d used pages\n",
> +		     bitmap_weight(pt->used_ptes, count));
> +	}
> +
> +	i915_dma_unmap_single(pt, dev);
>  	__free_page(pt->page);
> +	kfree(pt->used_ptes);
>  	kfree(pt);
>  }
>  
> -static struct i915_pagetab *alloc_pt_single(void)
> +#define free_pt_single(pt, dev) \
> +	__free_pt_single(pt, dev, false)
> +#define free_pt_scratch(pt, dev) \
> +	__free_pt_single(pt, dev, true)

Imo the distinction between _single and _scratch is confusing. Instead,
adding a new function unmap_and_free_pt which calls dma_unmap_page and
free_pt_single would make more sense. Then there's also no need for the __
version of the function and the scratch parameter.

It means that we'll need to kill the selfchecks for scratch, but by
uncluttering the indirections here a bit as proposed any such bugs should
be obvious. All the others will be caught by the dma mapping debugging
code (which is super-paranoid afaik).

> +
> +static struct i915_pagetab *alloc_pt_single(struct drm_device *dev)
>  {
>  	struct i915_pagetab *pt;
> +	const size_t count = INTEL_INFO(dev)->gen >= 8 ?
> +		GEN8_PTES_PER_PAGE : I915_PPGTT_PT_ENTRIES;
> +	int ret = -ENOMEM;
>  
>  	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
>  	if (!pt)
>  		return ERR_PTR(-ENOMEM);
>  
> +	pt->used_ptes = kcalloc(BITS_TO_LONGS(count), sizeof(*pt->used_ptes),
> +				GFP_KERNEL);
> +

I don't see the value in tracking used_ptes. For debugging we can just
look at the pte value itself (which should be in cached memory, so dirt
cheap). And since pagetables are the lowest level we can't screw up the
allocations/freeing. What do I miss?

> +	if (!pt->used_ptes)
> +		goto fail_bitmap;
> +
>  	pt->page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> -	if (!pt->page) {
> -		kfree(pt);
> -		return ERR_PTR(-ENOMEM);
> -	}
> +	if (!pt->page)
> +		goto fail_page;
> +
> +	ret = i915_dma_map_px_single(pt, dev);
> +	if (ret)
> +		goto fail_dma;
> +
> +	return pt;
> +
> +fail_dma:
> +	__free_page(pt->page);
> +fail_page:
> +	kfree(pt->used_ptes);
> +fail_bitmap:
> +	kfree(pt);
> +
> +	return ERR_PTR(ret);
> +}
> +
> +static inline struct i915_pagetab *alloc_pt_scratch(struct drm_device *dev)
> +{
> +	struct i915_pagetab *pt = alloc_pt_single(dev);
> +	if (!IS_ERR(pt))
> +		pt->scratch = 1;

Shouldn't we fill the scratch pt with scratch pte entries? Or do I miss
the point of this? Hard to tell without having users of scratch_pt in the
same patch as the one that adds them. Might be good to split this into yet
another patch.

>  
>  	return pt;
>  }
> @@ -313,7 +384,9 @@ static struct i915_pagetab *alloc_pt_single(void)
>   *
>   * Return: 0 if allocation succeeded.
>   */
> -static int alloc_pt_range(struct i915_pagedir *pd, uint16_t pde, size_t count)
> +static int alloc_pt_range(struct i915_pagedir *pd, uint16_t pde, size_t count,
> +		  struct drm_device *dev)

Aside: The reason I've suggested to split the patches is so that we can
get all the added *dev parameters out of the diff. That should help patch
readability a lot.

> +
>  {
>  	int i, ret;
>  
> @@ -323,7 +396,7 @@ static int alloc_pt_range(struct i915_pagedir *pd, uint16_t pde, size_t count)
>  	BUG_ON(pde + count > GEN6_PPGTT_PD_ENTRIES);
>  
>  	for (i = pde; i < pde + count; i++) {
> -		struct i915_pagetab *pt = alloc_pt_single();
> +		struct i915_pagetab *pt = alloc_pt_single(dev);
>  		if (IS_ERR(pt)) {
>  			ret = PTR_ERR(pt);
>  			goto err_out;
> @@ -338,7 +411,7 @@ static int alloc_pt_range(struct i915_pagedir *pd, uint16_t pde, size_t count)
>  
>  err_out:
>  	while (i--)
> -		free_pt_single(pd->page_tables[i]);
> +		free_pt_single(pd->page_tables[i], dev);
>  	return ret;
>  }
>  
> @@ -506,7 +579,7 @@ static void gen8_ppgtt_insert_entries(struct i915_address_space *vm,
>  	}
>  }
>  
> -static void gen8_free_page_tables(struct i915_pagedir *pd)
> +static void gen8_free_page_tables(struct i915_pagedir *pd, struct drm_device *dev)
>  {
>  	int i;
>  
> @@ -514,7 +587,7 @@ static void gen8_free_page_tables(struct i915_pagedir *pd)
>  		return;
>  
>  	for (i = 0; i < GEN8_PDES_PER_PAGE; i++) {
> -		free_pt_single(pd->page_tables[i]);
> +		free_pt_single(pd->page_tables[i], dev);
>  		pd->page_tables[i] = NULL;
>  	}
>  }
> @@ -524,7 +597,7 @@ static void gen8_ppgtt_free(struct i915_hw_ppgtt *ppgtt)
>  	int i;
>  
>  	for (i = 0; i < ppgtt->num_pd_pages; i++) {
> -		gen8_free_page_tables(ppgtt->pdp.pagedir[i]);
> +		gen8_free_page_tables(ppgtt->pdp.pagedir[i], ppgtt->base.dev);
>  		free_pd_single(ppgtt->pdp.pagedir[i]);
>  	}
>  }
> @@ -569,7 +642,7 @@ static int gen8_ppgtt_allocate_page_tables(struct i915_hw_ppgtt *ppgtt)
>  
>  	for (i = 0; i < ppgtt->num_pd_pages; i++) {
>  		ret = alloc_pt_range(ppgtt->pdp.pagedir[i],
> -				     0, GEN8_PDES_PER_PAGE);
> +				     0, GEN8_PDES_PER_PAGE, ppgtt->base.dev);
>  		if (ret)
>  			goto unwind_out;
>  	}
> @@ -578,7 +651,7 @@ static int gen8_ppgtt_allocate_page_tables(struct i915_hw_ppgtt *ppgtt)
>  
>  unwind_out:
>  	while (i--)
> -		gen8_free_page_tables(ppgtt->pdp.pagedir[i]);
> +		gen8_free_page_tables(ppgtt->pdp.pagedir[i], ppgtt->base.dev);
>  
>  	return -ENOMEM;
>  }
> @@ -808,26 +881,36 @@ static void gen6_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
>  	}
>  }
>  
> -static void gen6_write_pdes(struct i915_hw_ppgtt *ppgtt)
> +/* Write pde (index) from the page directory @pd to the page table @pt */
> +static void gen6_write_pdes(struct i915_pagedir *pd,
> +			    const int pde, struct i915_pagetab *pt)
>  {
> -	struct drm_i915_private *dev_priv = ppgtt->base.dev->dev_private;
> -	gen6_gtt_pte_t __iomem *pd_addr;
> -	uint32_t pd_entry;
> -	int i;
> +	struct i915_hw_ppgtt *ppgtt =
> +		container_of(pd, struct i915_hw_ppgtt, pd);
> +	u32 pd_entry;
>  
> -	WARN_ON(ppgtt->pd.pd_offset & 0x3f);
> -	pd_addr = (gen6_gtt_pte_t __iomem*)dev_priv->gtt.gsm +
> -		ppgtt->pd.pd_offset / sizeof(gen6_gtt_pte_t);
> -	for (i = 0; i < ppgtt->num_pd_entries; i++) {
> -		dma_addr_t pt_addr;
> +	pd_entry = GEN6_PDE_ADDR_ENCODE(pt->daddr);
> +	pd_entry |= GEN6_PDE_VALID;
>  
> -		pt_addr = ppgtt->pd.page_tables[i]->daddr;
> -		pd_entry = GEN6_PDE_ADDR_ENCODE(pt_addr);
> -		pd_entry |= GEN6_PDE_VALID;
> +	writel(pd_entry, ppgtt->pd_addr + pde);
>  
> -		writel(pd_entry, pd_addr + i);
> -	}
> -	readl(pd_addr);
> +	/* XXX: Caller needs to make sure the write completes if necessary */

Please take care of such XXX comments. That's the stuff I've meant when
I've said that the overall patch series needs a full pass with a critical
eye to catch development/rebase leftovers.

> +}
> +
> +/* Write all the page tables found in the ppgtt structure to incrementing page
> + * directories. */
> +static void gen6_write_page_range(struct drm_i915_private *dev_priv,
> +				struct i915_pagedir *pd, uint32_t start, uint32_t length)
> +{
> +	struct i915_pagetab *pt;
> +	uint32_t pde, temp;
> +
> +	gen6_for_each_pde(pt, pd, start, length, temp, pde)
> +		gen6_write_pdes(pd, pde, pt);
> +
> +	/* Make sure write is complete before other code can use this page
> +	 * table. Also require for WC mapped PTEs */
> +	readl(dev_priv->gtt.gsm);
>  }
>  
>  static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
> @@ -1043,13 +1126,59 @@ static void gen6_ppgtt_unmap_pages(struct i915_hw_ppgtt *ppgtt)
>  			       4096, PCI_DMA_BIDIRECTIONAL);
>  }
>  
> +static int gen6_alloc_va_range(struct i915_address_space *vm,
> +			       uint64_t start, uint64_t length)
> +{
> +	struct i915_hw_ppgtt *ppgtt =
> +				container_of(vm, struct i915_hw_ppgtt, base);
> +	struct i915_pagetab *pt;
> +	uint32_t pde, temp;
> +
> +	gen6_for_each_pde(pt, &ppgtt->pd, start, length, temp, pde) {
> +		int j;
> +
> +		DECLARE_BITMAP(tmp_bitmap, I915_PPGTT_PT_ENTRIES);
> +		bitmap_zero(tmp_bitmap, I915_PPGTT_PT_ENTRIES);
> +		bitmap_set(tmp_bitmap, gen6_pte_index(start),
> +			   gen6_pte_count(start, length));
> +
> +		/* TODO: To be done in the next patch. Map the page/insert
> +		 * entries here */
> +		for_each_set_bit(j, tmp_bitmap, I915_PPGTT_PT_ENTRIES) {
> +			if (test_bit(j, pt->used_ptes)) {
> +				/* Check that we're changing cache levels */

Again something only valid from older revisions since we've taken care of
the cache_level changes by only extending pagetables where actually
needed (in the bind functions). Furthermore with the used_ptes bitmask
gone this would all disappear anyway.

Given that the patch justifies itself by adding the dynamic allocation and
tracking first to debug it, but doesn't add any self-checks in the pte
writing funcs (afaics at least) I don't think this is all that useful any
more.

> +			}
> +		}
> +
> +		bitmap_or(pt->used_ptes, pt->used_ptes, tmp_bitmap,
> +				I915_PPGTT_PT_ENTRIES);
> +	}
> +
> +	return 0;
> +}
> +
> +static void gen6_teardown_va_range(struct i915_address_space *vm,
> +				   uint64_t start, uint64_t length)
> +{
> +	struct i915_hw_ppgtt *ppgtt =
> +				container_of(vm, struct i915_hw_ppgtt, base);
> +	struct i915_pagetab *pt;
> +	uint32_t pde, temp;
> +
> +	gen6_for_each_pde(pt, &ppgtt->pd, start, length, temp, pde) {
> +		bitmap_clear(pt->used_ptes, gen6_pte_index(start),
> +			     gen6_pte_count(start, length));
> +	}
> +}
> +
>  static void gen6_ppgtt_free(struct i915_hw_ppgtt *ppgtt)
>  {
>  	int i;
>  
>  	for (i = 0; i < ppgtt->num_pd_entries; i++)
> -		free_pt_single(ppgtt->pd.page_tables[i]);
> +		free_pt_single(ppgtt->pd.page_tables[i], ppgtt->base.dev);
>  
> +	free_pt_scratch(ppgtt->scratch_pt, ppgtt->base.dev);
>  	free_pd_single(&ppgtt->pd);
>  }
>  
> @@ -1076,6 +1205,9 @@ static int gen6_ppgtt_allocate_page_directories(struct i915_hw_ppgtt *ppgtt)
>  	 * size. We allocate at the top of the GTT to avoid fragmentation.
>  	 */
>  	BUG_ON(!drm_mm_initialized(&dev_priv->gtt.base.mm));
> +	ppgtt->scratch_pt = alloc_pt_scratch(ppgtt->base.dev);
> +	if (IS_ERR(ppgtt->scratch_pt))
> +		return PTR_ERR(ppgtt->scratch_pt);
>  alloc:
>  	ret = drm_mm_insert_node_in_range_generic(&dev_priv->gtt.base.mm,
>  						  &ppgtt->node, GEN6_PD_SIZE,
> @@ -1089,20 +1221,25 @@ alloc:
>  					       0, dev_priv->gtt.base.total,
>  					       0);
>  		if (ret)
> -			return ret;
> +			goto err_out;
>  
>  		retried = true;
>  		goto alloc;
>  	}
>  
>  	if (ret)
> -		return ret;
> +		goto err_out;
> +
>  
>  	if (ppgtt->node.start < dev_priv->gtt.mappable_end)
>  		DRM_DEBUG("Forced to use aperture for PDEs\n");
>  
>  	ppgtt->num_pd_entries = GEN6_PPGTT_PD_ENTRIES;
>  	return 0;
> +
> +err_out:
> +	free_pt_scratch(ppgtt->scratch_pt, ppgtt->base.dev);
> +	return ret;
>  }
>  
>  static int gen6_ppgtt_alloc(struct i915_hw_ppgtt *ppgtt)
> @@ -1113,7 +1250,9 @@ static int gen6_ppgtt_alloc(struct i915_hw_ppgtt *ppgtt)
>  	if (ret)
>  		return ret;
>  
> -	ret = alloc_pt_range(&ppgtt->pd, 0, ppgtt->num_pd_entries);
> +	ret = alloc_pt_range(&ppgtt->pd, 0, ppgtt->num_pd_entries,
> +			ppgtt->base.dev);
> +
>  	if (ret) {
>  		drm_mm_remove_node(&ppgtt->node);
>  		return ret;
> @@ -1122,30 +1261,6 @@ static int gen6_ppgtt_alloc(struct i915_hw_ppgtt *ppgtt)
>  	return 0;
>  }
>  
> -static int gen6_ppgtt_setup_page_tables(struct i915_hw_ppgtt *ppgtt)
> -{
> -	struct drm_device *dev = ppgtt->base.dev;
> -	int i;
> -
> -	for (i = 0; i < ppgtt->num_pd_entries; i++) {
> -		struct page *page;
> -		dma_addr_t pt_addr;
> -
> -		page = ppgtt->pd.page_tables[i]->page;
> -		pt_addr = pci_map_page(dev->pdev, page, 0, 4096,
> -				       PCI_DMA_BIDIRECTIONAL);
> -
> -		if (pci_dma_mapping_error(dev->pdev, pt_addr)) {
> -			gen6_ppgtt_unmap_pages(ppgtt);
> -			return -EIO;
> -		}
> -
> -		ppgtt->pd.page_tables[i]->daddr = pt_addr;
> -	}
> -
> -	return 0;
> -}
> -
>  static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>  {
>  	struct drm_device *dev = ppgtt->base.dev;
> @@ -1166,12 +1281,8 @@ static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>  	if (ret)
>  		return ret;
>  
> -	ret = gen6_ppgtt_setup_page_tables(ppgtt);
> -	if (ret) {
> -		gen6_ppgtt_free(ppgtt);
> -		return ret;
> -	}
> -
> +	ppgtt->base.allocate_va_range = gen6_alloc_va_range;
> +	ppgtt->base.teardown_va_range = gen6_teardown_va_range;
>  	ppgtt->base.clear_range = gen6_ppgtt_clear_range;
>  	ppgtt->base.insert_entries = gen6_ppgtt_insert_entries;
>  	ppgtt->base.cleanup = gen6_ppgtt_cleanup;
> @@ -1182,11 +1293,15 @@ static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>  	ppgtt->pd.pd_offset =
>  		ppgtt->node.start / PAGE_SIZE * sizeof(gen6_gtt_pte_t);
>  
> +	ppgtt->pd_addr = (gen6_gtt_pte_t __iomem *)dev_priv->gtt.gsm +
> +		ppgtt->pd.pd_offset / sizeof(gen6_gtt_pte_t);
> +
> +	gen6_write_page_range(dev_priv, &ppgtt->pd, 0, ppgtt->base.total);
> +
>  	DRM_DEBUG_DRIVER("Allocated pde space (%ldM) at GTT entry: %lx\n",
>  			 ppgtt->node.size >> 20,
>  			 ppgtt->node.start / PAGE_SIZE);
>  
> -	gen6_write_pdes(ppgtt);
>  	DRM_DEBUG("Adding PPGTT at offset %x\n",
>  		  ppgtt->pd.pd_offset << 10);
>  
> @@ -1318,6 +1433,9 @@ static void ppgtt_unbind_vma(struct i915_vma *vma)
>  			     vma->node.start,
>  			     vma->obj->base.size,
>  			     true);
> +	if (vma->vm->teardown_va_range)
> +		vma->vm->teardown_va_range(vma->vm,
> +					   vma->node.start, vma->node.size);

If we ditch used_ptes we can ditch this here too. As per my irc
discussion with Chris Wilson I think the best way to actually free
pagetables is in our shrinker, by simply freeing them all when a vm
contains no bound buffer at all. Much less fuss and avoids all the
complications that ripping out pagetables from underneath active users
might entail. On the cpu side the core vm doesn't even bother with that,
so I expect we can forgo that complexity on the gpu side, too.

>  }
>  
>  extern int intel_iommu_gfx_mapped;
> @@ -1461,13 +1579,14 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
>  
>  	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
>  		/* TODO: Perhaps it shouldn't be gen6 specific */
> -		if (i915_is_ggtt(vm)) {
> -			if (dev_priv->mm.aliasing_ppgtt)
> -				gen6_write_pdes(dev_priv->mm.aliasing_ppgtt);
> -			continue;
> -		}
>  
> -		gen6_write_pdes(container_of(vm, struct i915_hw_ppgtt, base));
> +		struct i915_hw_ppgtt *ppgtt =
> +			container_of(vm, struct i915_hw_ppgtt, base);
> +
> +		if (i915_is_ggtt(vm))
> +			ppgtt = dev_priv->mm.aliasing_ppgtt;
> +
> +		gen6_write_page_range(dev_priv, &ppgtt->pd, 0, ppgtt->num_pd_entries);
>  	}
>  
>  	i915_ggtt_flush(dev_priv);
> @@ -1633,8 +1752,8 @@ static void gen6_ggtt_clear_range(struct i915_address_space *vm,
>  
>  
>  static void i915_ggtt_bind_vma(struct i915_vma *vma,
> -			       enum i915_cache_level cache_level,
> -			       u32 unused)
> +			      enum i915_cache_level cache_level,
> +			      u32 unused)
>  {
>  	const unsigned long entry = vma->node.start >> PAGE_SHIFT;
>  	unsigned int flags = (cache_level == I915_CACHE_NONE) ?
> @@ -1666,8 +1785,8 @@ static void i915_ggtt_unbind_vma(struct i915_vma *vma)
>  }
>  
>  static void ggtt_bind_vma(struct i915_vma *vma,
> -			  enum i915_cache_level cache_level,
> -			  u32 flags)
> +			 enum i915_cache_level cache_level,
> +			 u32 flags)
>  {
>  	struct drm_device *dev = vma->vm->dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index c08fe8b..d579f74 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -54,7 +54,10 @@ typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
>  #define GEN6_PPGTT_PD_ENTRIES		512
>  #define GEN6_PD_SIZE			(GEN6_PPGTT_PD_ENTRIES * PAGE_SIZE)
>  #define GEN6_PD_ALIGN			(PAGE_SIZE * 16)
> +#define GEN6_PDE_SHIFT          22
>  #define GEN6_PDE_VALID			(1 << 0)
> +#define GEN6_PDE_MASK			(GEN6_PPGTT_PD_ENTRIES-1)
> +#define NUM_PTE(pde_shift)		(1 << (pde_shift - PAGE_SHIFT))
>  
>  #define GEN7_PTE_CACHE_L3_LLC		(3 << 1)
>  
> @@ -183,8 +186,32 @@ struct i915_vma {
>  	void (*unbind_vma)(struct i915_vma *vma);
>  	/* Map an object into an address space with the given cache flags. */
>  	void (*bind_vma)(struct i915_vma *vma,
> -			 enum i915_cache_level cache_level,
> -			 u32 flags);
> +			enum i915_cache_level cache_level,
> +			u32 flags);
> +};
> +
> +
> +struct i915_pagetab {
> +	struct page *page;
> +	dma_addr_t daddr;
> +
> +	unsigned long *used_ptes;
> +	unsigned int scratch:1;
> +};

There's a bit of noise in the diff because you move around structures. Imo
just using forward decls is better, or reorder the definitions in the
patch that adds them.

> +
> +struct i915_pagedir {
> +	struct page *page; /* NULL for GEN6-GEN7 */
> +	union {
> +		uint32_t pd_offset;
> +		dma_addr_t daddr;
> +	};
> +
> +	struct i915_pagetab *page_tables[GEN6_PPGTT_PD_ENTRIES];
> +};
> +
> +struct i915_pagedirpo {
> +	/* struct page *page; */
> +	struct i915_pagedir *pagedir[GEN8_LEGACY_PDPES];
>  };
>  
>  struct i915_address_space {
> @@ -226,6 +253,12 @@ struct i915_address_space {
>  	gen6_gtt_pte_t (*pte_encode)(dma_addr_t addr,
>  				     enum i915_cache_level level,
>  				     bool valid, u32 flags); /* Create a valid PTE */
> +	int (*allocate_va_range)(struct i915_address_space *vm,
> +				 uint64_t start,
> +				 uint64_t length);
> +	void (*teardown_va_range)(struct i915_address_space *vm,
> +				  uint64_t start,
> +				  uint64_t length);
>  	void (*clear_range)(struct i915_address_space *vm,
>  			    uint64_t start,
>  			    uint64_t length,
> @@ -237,6 +270,29 @@ struct i915_address_space {
>  	void (*cleanup)(struct i915_address_space *vm);
>  };
>  
> +struct i915_hw_ppgtt {
> +	struct i915_address_space base;
> +	struct kref ref;
> +	struct drm_mm_node node;
> +	unsigned num_pd_entries;
> +	unsigned num_pd_pages; /* gen8+ */
> +	union {
> +		struct i915_pagedirpo pdp;
> +		struct i915_pagedir pd;
> +	};
> +
> +	struct i915_pagetab *scratch_pt;
> +
> +	struct drm_i915_file_private *file_priv;
> +
> +	gen6_gtt_pte_t __iomem *pd_addr;
> +
> +	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> +	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> +			 struct intel_engine_cs *ring);
> +	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
> +};
> +
>  /* The Graphics Translation Table is the way in which GEN hardware translates a
>   * Graphics Virtual Address into a Physical Address. In addition to the normal
>   * collateral associated with any va->pa translations GEN hardware also has a
> @@ -265,44 +321,69 @@ struct i915_gtt {
>  			  unsigned long *mappable_end);
>  };
>  
> -struct i915_pagetab {
> -	struct page *page;
> -	dma_addr_t daddr;
> -};
> +/* For each pde iterates over every pde between from start until start + length.
> + * If start, and start+length are not perfectly divisible, the macro will round
> + * down, and up as needed. The macro modifies pde, start, and length. Dev is
> + * only used to differentiate shift values. Temp is temp.  On gen6/7, start = 0,
> + * and length = 2G effectively iterates over every PDE in the system. On gen8+
> + * it simply iterates over every page directory entry in a page directory.
> + *
> + * XXX: temp is not actually needed, but it saves doing the ALIGN operation.
> + */
> +#define gen6_for_each_pde(pt, pd, start, length, temp, iter) \
> +	for (iter = gen6_pde_index(start), pt = (pd)->page_tables[iter]; \
> +	     length > 0 && iter < GEN6_PPGTT_PD_ENTRIES; \
> +	     pt = (pd)->page_tables[++iter], \
> +	     temp = ALIGN(start+1, 1 << GEN6_PDE_SHIFT) - start, \
> +	     temp = min(temp, (unsigned)length), \
> +	     start += temp, length -= temp)
> +
> +static inline uint32_t i915_pte_index(uint64_t address, uint32_t pde_shift)
> +{
> +	const uint32_t mask = NUM_PTE(pde_shift) - 1;
> +	return (address >> PAGE_SHIFT) & mask;
> +}
>  
> -struct i915_pagedir {
> -	struct page *page; /* NULL for GEN6-GEN7 */
> -	union {
> -		uint32_t pd_offset;
> -		dma_addr_t daddr;
> -	};
> +/* Helper to counts the number of PTEs within the given length. This count does
> +* not cross a page table boundary, so the max value would be
> +* I915_PPGTT_PT_ENTRIES for GEN6, and GEN8_PTES_PER_PAGE for GEN8.
> +*/
> +static inline size_t i915_pte_count(uint64_t addr, size_t length,
> +					uint32_t pde_shift)
> +{
> +	const uint64_t mask = ~((1 << pde_shift) - 1);
> +	uint64_t end;
>  
> -	struct i915_pagetab *page_tables[GEN6_PPGTT_PD_ENTRIES]; /* PDEs */
> -};
> +	BUG_ON(length == 0);
> +	BUG_ON(offset_in_page(addr|length));
>  
> -struct i915_pagedirpo {
> -	/* struct page *page; */
> -	struct i915_pagedir *pagedir[GEN8_LEGACY_PDPES];
> -};
> +	end = addr + length;
>  
> -struct i915_hw_ppgtt {
> -	struct i915_address_space base;
> -	struct kref ref;
> -	struct drm_mm_node node;
> -	unsigned num_pd_entries;
> -	unsigned num_pd_pages; /* gen8+ */
> -	union {
> -		struct i915_pagedirpo pdp;
> -		struct i915_pagedir pd;
> -	};
> +	if ((addr & mask) != (end & mask))
> +		return NUM_PTE(pde_shift) - i915_pte_index(addr, pde_shift);
>  
> -	struct drm_i915_file_private *file_priv;
> +	return i915_pte_index(end, pde_shift) - i915_pte_index(addr, pde_shift);
> +}
>  
> -	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> -	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring);
> -	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
> -};
> +static inline uint32_t i915_pde_index(uint64_t addr, uint32_t shift)

Shouldn't this have a gen6 prefix, too?

> +{
> +	return (addr >> shift) & GEN6_PDE_MASK;
> +}
> +
> +static inline uint32_t gen6_pte_index(uint32_t addr)
> +{
> +	return i915_pte_index(addr, GEN6_PDE_SHIFT);
> +}
> +
> +static inline size_t gen6_pte_count(uint32_t addr, uint32_t length)
> +{
> +	return i915_pte_count(addr, length, GEN6_PDE_SHIFT);
> +}
> +
> +static inline uint32_t gen6_pde_index(uint32_t addr)
> +{
> +	return i915_pde_index(addr, GEN6_PDE_SHIFT);
> +}
>  
>  int i915_gem_gtt_init(struct drm_device *dev);
>  void i915_gem_init_global_gtt(struct drm_device *dev);
> -- 
> 2.1.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch