[Intel-gfx] [PATCH 06/23] drm/i915: Preallocate stashes for vma page-directories

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Jul 3 16:47:21 UTC 2020


On 02/07/2020 09:32, Chris Wilson wrote:
> We need the DMA allocations used for page directories to be performed
> up front so that we can include those allocations in our memory
> reservation pass. The downside is that we have to assume the worst
> case, even before we know the final layout, and always allocate enough
> page directories for this object, even when there will be overlap.
> 
> It should be noted that the lifetime of the page directories DMA is
> more or less decoupled from individual fences, as they will be shared
> across objects and timelines.

Why specifically are you pointing this out?
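
For context, my reading of the new flow, as used by
init_aliasing_ppgtt() below, is roughly:

	struct i915_vm_pt_stash stash = {};
	int err;

	/* Fallible step: preallocate the worst-case number of tables. */
	err = i915_vm_alloc_pt_stash(vm, &stash, size);
	if (err)
		return err;

	/* Infallible step: consumes tables from the stash as needed. */
	vm->allocate_va_range(vm, &stash, start, size);

	/* Return whatever the worst-case estimate over-allocated. */
	i915_vm_free_pt_stash(vm, &stash);

i.e. the fallible allocation is pulled out of allocate_va_range()
entirely.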

> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   .../gpu/drm/i915/gem/i915_gem_client_blt.c    | 11 +--
>   drivers/gpu/drm/i915/gt/gen6_ppgtt.c          | 38 +++------
>   drivers/gpu/drm/i915/gt/gen8_ppgtt.c          | 77 +++++-------------
>   drivers/gpu/drm/i915/gt/intel_ggtt.c          | 45 +++++------
>   drivers/gpu/drm/i915/gt/intel_gtt.h           | 39 ++++++---
>   drivers/gpu/drm/i915/gt/intel_ppgtt.c         | 80 ++++++++++++++++---
>   drivers/gpu/drm/i915/i915_vma.c               | 29 ++++---
>   drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 60 ++++++++------
>   drivers/gpu/drm/i915/selftests/mock_gtt.c     | 22 ++---
>   9 files changed, 224 insertions(+), 177 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> index 278664f831e7..947c8aa8e13e 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> @@ -32,12 +32,13 @@ static void vma_clear_pages(struct i915_vma *vma)
>   	vma->pages = NULL;
>   }
>   
> -static int vma_bind(struct i915_address_space *vm,
> -		    struct i915_vma *vma,
> -		    enum i915_cache_level cache_level,
> -		    u32 flags)
> +static void vma_bind(struct i915_address_space *vm,
> +		     struct i915_vm_pt_stash *stash,
> +		     struct i915_vma *vma,
> +		     enum i915_cache_level cache_level,
> +		     u32 flags)
>   {
> -	return vm->vma_ops.bind_vma(vm, vma, cache_level, flags);
> +	vm->vma_ops.bind_vma(vm, stash, vma, cache_level, flags);
>   }
>   
>   static void vma_unbind(struct i915_address_space *vm, struct i915_vma *vma)
> diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
> index 05497b50103f..35e2b698f9ed 100644
> --- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
> +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
> @@ -177,16 +177,16 @@ static void gen6_flush_pd(struct gen6_ppgtt *ppgtt, u64 start, u64 end)
>   	mutex_unlock(&ppgtt->flush);
>   }
>   
> -static int gen6_alloc_va_range(struct i915_address_space *vm,
> -			       u64 start, u64 length)
> +static void gen6_alloc_va_range(struct i915_address_space *vm,
> +				struct i915_vm_pt_stash *stash,
> +				u64 start, u64 length)
>   {
>   	struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
>   	struct i915_page_directory * const pd = ppgtt->base.pd;
> -	struct i915_page_table *pt, *alloc = NULL;
> +	struct i915_page_table *pt;
>   	intel_wakeref_t wakeref;
>   	u64 from = start;
>   	unsigned int pde;
> -	int ret = 0;
>   
>   	wakeref = intel_runtime_pm_get(&vm->i915->runtime_pm);
>   
> @@ -197,21 +197,17 @@ static int gen6_alloc_va_range(struct i915_address_space *vm,
>   		if (px_base(pt) == px_base(&vm->scratch[1])) {
>   			spin_unlock(&pd->lock);
>   
> -			pt = fetch_and_zero(&alloc);
> -			if (!pt)
> -				pt = alloc_pt(vm);
> -			if (IS_ERR(pt)) {
> -				ret = PTR_ERR(pt);
> -				goto unwind_out;
> -			}
> +			pt = stash->pt[0];
> +			GEM_BUG_ON(!pt);
>   
>   			fill32_px(pt, vm->scratch[0].encode);
>   
>   			spin_lock(&pd->lock);
>   			if (pd->entry[pde] == &vm->scratch[1]) {
> +				stash->pt[0] = pt->stash;
> +				atomic_set(&pt->used, 0);
>   				pd->entry[pde] = pt;
>   			} else {
> -				alloc = pt;
>   				pt = pd->entry[pde];
>   			}
>   		}
> @@ -223,15 +219,7 @@ static int gen6_alloc_va_range(struct i915_address_space *vm,
>   	if (i915_vma_is_bound(ppgtt->vma, I915_VMA_GLOBAL_BIND))
>   		gen6_flush_pd(ppgtt, from, start);
>   
> -	goto out;
> -
> -unwind_out:
> -	gen6_ppgtt_clear_range(vm, from, start - from);
> -out:
> -	if (alloc)
> -		free_px(vm, alloc);
>   	intel_runtime_pm_put(&vm->i915->runtime_pm, wakeref);
> -	return ret;
>   }
>   
>   static int gen6_ppgtt_init_scratch(struct gen6_ppgtt *ppgtt)
> @@ -299,10 +287,11 @@ static void pd_vma_clear_pages(struct i915_vma *vma)
>   	vma->pages = NULL;
>   }
>   
> -static int pd_vma_bind(struct i915_address_space *vm,
> -		       struct i915_vma *vma,
> -		       enum i915_cache_level cache_level,
> -		       u32 unused)
> +static void pd_vma_bind(struct i915_address_space *vm,
> +			struct i915_vm_pt_stash *stash,
> +			struct i915_vma *vma,
> +			enum i915_cache_level cache_level,
> +			u32 unused)
>   {
>   	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
>   	struct gen6_ppgtt *ppgtt = vma->private;
> @@ -312,7 +301,6 @@ static int pd_vma_bind(struct i915_address_space *vm,
>   	ppgtt->pd_addr = (gen6_pte_t __iomem *)ggtt->gsm + ggtt_offset;
>   
>   	gen6_flush_pd(ppgtt, 0, ppgtt->base.vm.total);
> -	return 0;
>   }
>   
>   static void pd_vma_unbind(struct i915_address_space *vm, struct i915_vma *vma)
> diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
> index 699125928272..e6f2acd445dd 100644
> --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
> +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
> @@ -269,14 +269,12 @@ static void gen8_ppgtt_clear(struct i915_address_space *vm,
>   			   start, start + length, vm->top);
>   }
>   
> -static int __gen8_ppgtt_alloc(struct i915_address_space * const vm,
> -			      struct i915_page_directory * const pd,
> -			      u64 * const start, const u64 end, int lvl)
> +static void __gen8_ppgtt_alloc(struct i915_address_space * const vm,
> +			       struct i915_vm_pt_stash *stash,
> +			       struct i915_page_directory * const pd,
> +			       u64 * const start, const u64 end, int lvl)
>   {
> -	const struct i915_page_scratch * const scratch = &vm->scratch[lvl];
> -	struct i915_page_table *alloc = NULL;
>   	unsigned int idx, len;
> -	int ret = 0;
>   
>   	GEM_BUG_ON(end > vm->total >> GEN8_PTE_SHIFT);
>   
> @@ -297,49 +295,30 @@ static int __gen8_ppgtt_alloc(struct i915_address_space * const vm,
>   			DBG("%s(%p):{ lvl:%d, idx:%d } allocating new tree\n",
>   			    __func__, vm, lvl + 1, idx);
>   
> -			pt = fetch_and_zero(&alloc);
> -			if (lvl) {
> -				if (!pt) {
> -					pt = &alloc_pd(vm)->pt;
> -					if (IS_ERR(pt)) {
> -						ret = PTR_ERR(pt);
> -						goto out;
> -					}
> -				}
> +			pt = stash->pt[!!lvl];
> +			GEM_BUG_ON(!pt);
>   
> +			if (lvl ||
> +			    gen8_pt_count(*start, end) < I915_PDES ||
> +			    intel_vgpu_active(vm->i915))
>   				fill_px(pt, vm->scratch[lvl].encode);
> -			} else {
> -				if (!pt) {
> -					pt = alloc_pt(vm);
> -					if (IS_ERR(pt)) {
> -						ret = PTR_ERR(pt);
> -						goto out;
> -					}
> -				}
> -
> -				if (intel_vgpu_active(vm->i915) ||
> -				    gen8_pt_count(*start, end) < I915_PDES)
> -					fill_px(pt, vm->scratch[lvl].encode);
> -			}
>   
>   			spin_lock(&pd->lock);
> -			if (likely(!pd->entry[idx]))
> +			if (likely(!pd->entry[idx])) {
> +				stash->pt[!!lvl] = pt->stash;
> +				atomic_set(&pt->used, 0);
>   				set_pd_entry(pd, idx, pt);
> -			else
> -				alloc = pt, pt = pd->entry[idx];
> +			} else {
> +				pt = pd->entry[idx];
> +			}
>   		}
>   
>   		if (lvl) {
>   			atomic_inc(&pt->used);
>   			spin_unlock(&pd->lock);
>   
> -			ret = __gen8_ppgtt_alloc(vm, as_pd(pt),
> -						 start, end, lvl);
> -			if (unlikely(ret)) {
> -				if (release_pd_entry(pd, idx, pt, scratch))
> -					free_px(vm, pt);
> -				goto out;
> -			}
> +			__gen8_ppgtt_alloc(vm, stash,
> +					   as_pd(pt), start, end, lvl);
>   
>   			spin_lock(&pd->lock);
>   			atomic_dec(&pt->used);
> @@ -359,18 +338,12 @@ static int __gen8_ppgtt_alloc(struct i915_address_space * const vm,
>   		}
>   	} while (idx++, --len);
>   	spin_unlock(&pd->lock);
> -out:
> -	if (alloc)
> -		free_px(vm, alloc);
> -	return ret;
>   }
>   
> -static int gen8_ppgtt_alloc(struct i915_address_space *vm,
> -			    u64 start, u64 length)
> +static void gen8_ppgtt_alloc(struct i915_address_space *vm,
> +			     struct i915_vm_pt_stash *stash,
> +			     u64 start, u64 length)
>   {
> -	u64 from;
> -	int err;
> -
>   	GEM_BUG_ON(!IS_ALIGNED(start, BIT_ULL(GEN8_PTE_SHIFT)));
>   	GEM_BUG_ON(!IS_ALIGNED(length, BIT_ULL(GEN8_PTE_SHIFT)));
>   	GEM_BUG_ON(range_overflows(start, length, vm->total));
> @@ -378,15 +351,9 @@ static int gen8_ppgtt_alloc(struct i915_address_space *vm,
>   	start >>= GEN8_PTE_SHIFT;
>   	length >>= GEN8_PTE_SHIFT;
>   	GEM_BUG_ON(length == 0);
> -	from = start;
> -
> -	err = __gen8_ppgtt_alloc(vm, i915_vm_to_ppgtt(vm)->pd,
> -				 &start, start + length, vm->top);
> -	if (unlikely(err && from != start))
> -		__gen8_ppgtt_clear(vm, i915_vm_to_ppgtt(vm)->pd,
> -				   from, start, vm->top);
>   
> -	return err;
> +	__gen8_ppgtt_alloc(vm, stash, i915_vm_to_ppgtt(vm)->pd,
> +			   &start, start + length, vm->top);
>   }
>   
>   static __always_inline void
> diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c
> index 62979ea591f0..791e4070ef31 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
> @@ -436,16 +436,17 @@ static void i915_ggtt_clear_range(struct i915_address_space *vm,
>   	intel_gtt_clear_range(start >> PAGE_SHIFT, length >> PAGE_SHIFT);
>   }
>   
> -static int ggtt_bind_vma(struct i915_address_space *vm,
> -			 struct i915_vma *vma,
> -			 enum i915_cache_level cache_level,
> -			 u32 flags)
> +static void ggtt_bind_vma(struct i915_address_space *vm,
> +			  struct i915_vm_pt_stash *stash,
> +			  struct i915_vma *vma,
> +			  enum i915_cache_level cache_level,
> +			  u32 flags)
>   {
>   	struct drm_i915_gem_object *obj = vma->obj;
>   	u32 pte_flags;
>   
>   	if (i915_vma_is_bound(vma, ~flags & I915_VMA_BIND_MASK))
> -		return 0;
> +		return;
>   
>   	/* Applicable to VLV (gen8+ do not support RO in the GGTT) */
>   	pte_flags = 0;
> @@ -454,8 +455,6 @@ static int ggtt_bind_vma(struct i915_address_space *vm,
>   
>   	vm->insert_entries(vm, vma, cache_level, pte_flags);
>   	vma->page_sizes.gtt = I915_GTT_PAGE_SIZE;
> -
> -	return 0;
>   }
>   
>   static void ggtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma)
> @@ -568,31 +567,25 @@ static int init_ggtt(struct i915_ggtt *ggtt)
>   	return ret;
>   }
>   
> -static int aliasing_gtt_bind_vma(struct i915_address_space *vm,
> -				 struct i915_vma *vma,
> -				 enum i915_cache_level cache_level,
> -				 u32 flags)
> +static void aliasing_gtt_bind_vma(struct i915_address_space *vm,
> +				  struct i915_vm_pt_stash *stash,
> +				  struct i915_vma *vma,
> +				  enum i915_cache_level cache_level,
> +				  u32 flags)
>   {
>   	u32 pte_flags;
> -	int ret;
>   
>   	/* Currently applicable only to VLV */
>   	pte_flags = 0;
>   	if (i915_gem_object_is_readonly(vma->obj))
>   		pte_flags |= PTE_READ_ONLY;
>   
> -	if (flags & I915_VMA_LOCAL_BIND) {
> -		struct i915_ppgtt *alias = i915_vm_to_ggtt(vm)->alias;
> -
> -		ret = ppgtt_bind_vma(&alias->vm, vma, cache_level, flags);
> -		if (ret)
> -			return ret;
> -	}
> +	if (flags & I915_VMA_LOCAL_BIND)
> +		ppgtt_bind_vma(&i915_vm_to_ggtt(vm)->alias->vm,
> +			       stash, vma, cache_level, flags);
>   
>   	if (flags & I915_VMA_GLOBAL_BIND)
>   		vm->insert_entries(vm, vma, cache_level, pte_flags);
> -
> -	return 0;
>   }
>   
>   static void aliasing_gtt_unbind_vma(struct i915_address_space *vm,
> @@ -607,6 +600,7 @@ static void aliasing_gtt_unbind_vma(struct i915_address_space *vm,
>   
>   static int init_aliasing_ppgtt(struct i915_ggtt *ggtt)
>   {
> +	struct i915_vm_pt_stash stash = {};
>   	struct i915_ppgtt *ppgtt;
>   	int err;
>   
> @@ -619,15 +613,17 @@ static int init_aliasing_ppgtt(struct i915_ggtt *ggtt)
>   		goto err_ppgtt;
>   	}
>   
> +	err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, ggtt->vm.total);
> +	if (err)
> +		goto err_ppgtt;
> +
>   	/*
>   	 * Note we only pre-allocate as far as the end of the global
>   	 * GTT. On 48b / 4-level page-tables, the difference is very,
>   	 * very significant! We have to preallocate as GVT/vgpu does
>   	 * not like the page directory disappearing.
>   	 */
> -	err = ppgtt->vm.allocate_va_range(&ppgtt->vm, 0, ggtt->vm.total);
> -	if (err)
> -		goto err_ppgtt;
> +	ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash, 0, ggtt->vm.total);
>   
>   	ggtt->alias = ppgtt;
>   	ggtt->vm.bind_async_flags |= ppgtt->vm.bind_async_flags;
> @@ -638,6 +634,7 @@ static int init_aliasing_ppgtt(struct i915_ggtt *ggtt)
>   	GEM_BUG_ON(ggtt->vm.vma_ops.unbind_vma != ggtt_unbind_vma);
>   	ggtt->vm.vma_ops.unbind_vma = aliasing_gtt_unbind_vma;
>   
> +	i915_vm_free_pt_stash(&ppgtt->vm, &stash);
>   	return 0;
>   
>   err_ppgtt:
> diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h
> index f2b75078e05f..8bd462d2fcd9 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gtt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
> @@ -159,7 +159,10 @@ struct i915_page_scratch {
>   
>   struct i915_page_table {
>   	struct i915_page_dma base;
> -	atomic_t used;
> +	union {
> +		atomic_t used;
> +		struct i915_page_table *stash;

What is it for?
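
My guess from the hunks above: while a table sits on the stash it is by
definition not in use, so the storage of the 'used' counter doubles as
the free-list link. That would also explain the pop sequence:

	stash->pt[0] = pt->stash;	/* unlink from the stash */
	atomic_set(&pt->used, 0);	/* 'used' aliased 'stash' until now */

A comment on the union would help.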

> +	};
>   };
>   
>   struct i915_page_directory {
> @@ -196,12 +199,18 @@ struct drm_i915_gem_object;
>   struct i915_vma;
>   struct intel_gt;
>   
> +struct i915_vm_pt_stash {
> +	/* preallocated chains of page tables/directories */
> +	struct i915_page_table *pt[2];

How does the chain work? A short explanation here would save having to
reverse engineer it from the code.
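
From a quick read it looks like a simple LIFO list threaded through the
new pt->stash pointer, with pt[0] collecting page tables and pt[1]
collecting page directories (for every level above the first). A sketch
of my understanding:

	/* i915_vm_alloc_pt_stash() pushes each preallocated entry: */
	pt->stash = stash->pt[n];
	stash->pt[n] = pt;

	/* allocate_va_range() pops one whenever it must fill a hole: */
	pt = stash->pt[n];
	stash->pt[n] = pt->stash;

Spelling that out in a comment here would save the next reader the same
exercise.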

> +};
> +
>   struct i915_vma_ops {
>   	/* Map an object into an address space with the given cache flags. */
> -	int (*bind_vma)(struct i915_address_space *vm,
> -			struct i915_vma *vma,
> -			enum i915_cache_level cache_level,
> -			u32 flags);
> +	void (*bind_vma)(struct i915_address_space *vm,
> +			 struct i915_vm_pt_stash *stash,
> +			 struct i915_vma *vma,
> +			 enum i915_cache_level cache_level,
> +			 u32 flags);
>   	/*
>   	 * Unmap an object from an address space. This usually consists of
>   	 * setting the valid PTE entries to a reserved scratch page.
> @@ -281,8 +290,9 @@ struct i915_address_space {
>   			  u32 flags); /* Create a valid PTE */
>   #define PTE_READ_ONLY	BIT(0)
>   
> -	int (*allocate_va_range)(struct i915_address_space *vm,
> -				 u64 start, u64 length);
> +	void (*allocate_va_range)(struct i915_address_space *vm,
> +				  struct i915_vm_pt_stash *stash,
> +				  u64 start, u64 length);
>   	void (*clear_range)(struct i915_address_space *vm,
>   			    u64 start, u64 length);
>   	void (*insert_page)(struct i915_address_space *vm,
> @@ -568,10 +578,11 @@ int ggtt_set_pages(struct i915_vma *vma);
>   int ppgtt_set_pages(struct i915_vma *vma);
>   void clear_pages(struct i915_vma *vma);
>   
> -int ppgtt_bind_vma(struct i915_address_space *vm,
> -		   struct i915_vma *vma,
> -		   enum i915_cache_level cache_level,
> -		   u32 flags);
> +void ppgtt_bind_vma(struct i915_address_space *vm,
> +		    struct i915_vm_pt_stash *stash,
> +		    struct i915_vma *vma,
> +		    enum i915_cache_level cache_level,
> +		    u32 flags);
>   void ppgtt_unbind_vma(struct i915_address_space *vm,
>   		      struct i915_vma *vma);
>   
> @@ -579,6 +590,12 @@ void gtt_write_workarounds(struct intel_gt *gt);
>   
>   void setup_private_pat(struct intel_uncore *uncore);
>   
> +int i915_vm_alloc_pt_stash(struct i915_address_space *vm,
> +			   struct i915_vm_pt_stash *stash,
> +			   u64 size);
> +void i915_vm_free_pt_stash(struct i915_address_space *vm,
> +			   struct i915_vm_pt_stash *stash);
> +
>   static inline struct sgt_dma {
>   	struct scatterlist *sg;
>   	dma_addr_t dma, max;
> diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> index f0862e924d11..9633fd2d294d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> @@ -155,19 +155,16 @@ struct i915_ppgtt *i915_ppgtt_create(struct intel_gt *gt)
>   	return ppgtt;
>   }
>   
> -int ppgtt_bind_vma(struct i915_address_space *vm,
> -		   struct i915_vma *vma,
> -		   enum i915_cache_level cache_level,
> -		   u32 flags)
> +void ppgtt_bind_vma(struct i915_address_space *vm,
> +		    struct i915_vm_pt_stash *stash,
> +		    struct i915_vma *vma,
> +		    enum i915_cache_level cache_level,
> +		    u32 flags)
>   {
>   	u32 pte_flags;
> -	int err;
>   
>   	if (!test_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma))) {
> -		err = vm->allocate_va_range(vm, vma->node.start, vma->size);
> -		if (err)
> -			return err;
> -
> +		vm->allocate_va_range(vm, stash, vma->node.start, vma->size);
>   		set_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma));
>   	}
>   
> @@ -178,8 +175,6 @@ int ppgtt_bind_vma(struct i915_address_space *vm,
>   
>   	vm->insert_entries(vm, vma, cache_level, pte_flags);
>   	wmb();
> -
> -	return 0;
>   }
>   
>   void ppgtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma)
> @@ -188,12 +183,73 @@ void ppgtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma)
>   		vm->clear_range(vm, vma->node.start, vma->size);
>   }
>   
> +static unsigned long pd_count(u64 size, int shift)
> +{
> +	/* Beware later misalignment */
> +	return (size + 2 * (BIT_ULL(shift) - 1)) >> shift;

Beware of what misalignment, and how? :)
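
If I guess: the final offset of the node is not known at this point, so
the worst-case alignment has to be assumed, and an unaligned range of
'size' can straddle one extra 1 << shift block at each end. For
example:

	/* A 2 MiB range starting at block + 4K covers two 2 MiB blocks:
	 *   pd_count(SZ_2M, 21) == (SZ_2M + 2 * (SZ_2M - 1)) >> 21 == 2
	 */

Saying that in the comment would be clearer than "beware".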

> +}
> +
> +int i915_vm_alloc_pt_stash(struct i915_address_space *vm,
> +			   struct i915_vm_pt_stash *stash,
> +			   u64 size)
> +{
> +	unsigned long count;
> +	int shift = 21;

I wanted to ask what 21 is (2 MiB?), but it is probably better overall
if Matt or Mika reviewed this one.
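
Presumably it is 21 because a single page table holds 2^9 entries of
4K (2^12) pages, so it maps 2 MiB of VA, and each directory level above
it covers a further 9 bits, hence the shift += 9 in the loop below. If
so, a define or a comment would make it explicit, e.g.:

	/* One page table: 9 bits of PTEs over 12 bits of page, i.e.
	 * 1 << (12 + 9) == 2 MiB of VA. Each extra level adds 9 bits.
	 */
	int shift = 21;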

Regards,

Tvrtko

> +	int n;
> +
> +	count = pd_count(size, shift);
> +	while (count--) {
> +		struct i915_page_table *pt;
> +
> +		pt = alloc_pt(vm);
> +		if (IS_ERR(pt)) {
> +			i915_vm_free_pt_stash(vm, stash);
> +			return PTR_ERR(pt);
> +		}
> +
> +		pt->stash = stash->pt[0];
> +		stash->pt[0] = pt;
> +	}
> +
> +	for (n = 1; n < vm->top; n++) {
> +		shift += 9;
> +		count = pd_count(size, shift);
> +		while (count--) {
> +			struct i915_page_directory *pd;
> +
> +			pd = alloc_pd(vm);
> +			if (IS_ERR(pd)) {
> +				i915_vm_free_pt_stash(vm, stash);
> +				return PTR_ERR(pd);
> +			}
> +
> +			pd->pt.stash = stash->pt[1];
> +			stash->pt[1] = &pd->pt;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +void i915_vm_free_pt_stash(struct i915_address_space *vm,
> +			   struct i915_vm_pt_stash *stash)
> +{
> +	struct i915_page_table *pt;
> +	int n;
> +
> +	for (n = 0; n < ARRAY_SIZE(stash->pt); n++) {
> +		while ((pt = stash->pt[n])) {
> +			stash->pt[n] = pt->stash;
> +			free_px(vm, pt);
> +		}
> +	}
> +}
> +
>   int ppgtt_set_pages(struct i915_vma *vma)
>   {
>   	GEM_BUG_ON(vma->pages);
>   
>   	vma->pages = vma->obj->mm.pages;
> -
>   	vma->page_sizes = vma->obj->mm.page_sizes;
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index 627bac2e0252..fc8a083753bd 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -295,6 +295,8 @@ i915_vma_instance(struct drm_i915_gem_object *obj,
>   
>   struct i915_vma_work {
>   	struct dma_fence_work base;
> +	struct i915_address_space *vm;
> +	struct i915_vm_pt_stash stash;
>   	struct i915_vma *vma;
>   	struct drm_i915_gem_object *pinned;
>   	struct i915_sw_dma_fence_cb cb;
> @@ -306,13 +308,10 @@ static int __vma_bind(struct dma_fence_work *work)
>   {
>   	struct i915_vma_work *vw = container_of(work, typeof(*vw), base);
>   	struct i915_vma *vma = vw->vma;
> -	int err;
> -
> -	err = vma->ops->bind_vma(vma->vm, vma, vw->cache_level, vw->flags);
> -	if (err)
> -		atomic_or(I915_VMA_ERROR, &vma->flags);
>   
> -	return err;
> +	vma->ops->bind_vma(vw->vm, &vw->stash,
> +			   vma, vw->cache_level, vw->flags);
> +	return 0;
>   }
>   
>   static void __vma_release(struct dma_fence_work *work)
> @@ -321,6 +320,9 @@ static void __vma_release(struct dma_fence_work *work)
>   
>   	if (vw->pinned)
>   		__i915_gem_object_unpin_pages(vw->pinned);
> +
> +	i915_vm_free_pt_stash(vw->vm, &vw->stash);
> +	i915_vm_put(vw->vm);
>   }
>   
>   static const struct dma_fence_work_ops bind_ops = {
> @@ -380,7 +382,6 @@ int i915_vma_bind(struct i915_vma *vma,
>   {
>   	u32 bind_flags;
>   	u32 vma_flags;
> -	int ret;
>   
>   	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
>   	GEM_BUG_ON(vma->size > vma->node.size);
> @@ -437,9 +438,7 @@ int i915_vma_bind(struct i915_vma *vma,
>   			work->pinned = vma->obj;
>   		}
>   	} else {
> -		ret = vma->ops->bind_vma(vma->vm, vma, cache_level, bind_flags);
> -		if (ret)
> -			return ret;
> +		vma->ops->bind_vma(vma->vm, NULL, vma, cache_level, bind_flags);
>   	}
>   
>   	atomic_or(bind_flags, &vma->flags);
> @@ -878,11 +877,21 @@ int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
>   		return err;
>   
>   	if (flags & vma->vm->bind_async_flags) {
> +		u64 max_size;
> +
>   		work = i915_vma_work();
>   		if (!work) {
>   			err = -ENOMEM;
>   			goto err_pages;
>   		}
> +
> +		work->vm = i915_vm_get(vma->vm);
> +
> +		/* Allocate enough page directories to cover worst case */
> +		max_size = max(size, vma->size);
> +		if (flags & PIN_MAPPABLE)
> +			max_size = max_t(u64, max_size, vma->fence_size);
> +		i915_vm_alloc_pt_stash(vma->vm, &work->stash, max_size);
>   	}
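
Also, unless I am misreading, the return value of
i915_vm_alloc_pt_stash() is ignored here, so a failed preallocation
would only be caught by the GEM_BUG_ON(!pt) in the worker. I would have
expected something like (err_fence being a hypothetical label that
unwinds 'work'):

	err = i915_vm_alloc_pt_stash(vma->vm, &work->stash, max_size);
	if (err)
		goto err_fence;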
>   
>   	if (flags & PIN_GLOBAL)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> index 0016ffc7d914..9b8fc990e9ef 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> @@ -172,35 +172,33 @@ static int igt_ppgtt_alloc(void *arg)
>   
>   	/* Check we can allocate the entire range */
>   	for (size = 4096; size <= limit; size <<= 2) {
> -		err = ppgtt->vm.allocate_va_range(&ppgtt->vm, 0, size);
> -		if (err) {
> -			if (err == -ENOMEM) {
> -				pr_info("[1] Ran out of memory for va_range [0 + %llx] [bit %d]\n",
> -					size, ilog2(size));
> -				err = 0; /* virtual space too large! */
> -			}
> +		struct i915_vm_pt_stash stash = {};
> +
> +		err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, size);
> +		if (err)
>   			goto err_ppgtt_cleanup;
> -		}
>   
> +		ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash, 0, size);
>   		cond_resched();
>   
>   		ppgtt->vm.clear_range(&ppgtt->vm, 0, size);
> +
> +		i915_vm_free_pt_stash(&ppgtt->vm, &stash);
>   	}
>   
>   	/* Check we can incrementally allocate the entire range */
>   	for (last = 0, size = 4096; size <= limit; last = size, size <<= 2) {
> -		err = ppgtt->vm.allocate_va_range(&ppgtt->vm,
> -						  last, size - last);
> -		if (err) {
> -			if (err == -ENOMEM) {
> -				pr_info("[2] Ran out of memory for va_range [%llx + %llx] [bit %d]\n",
> -					last, size - last, ilog2(size));
> -				err = 0; /* virtual space too large! */
> -			}
> +		struct i915_vm_pt_stash stash = {};
> +
> +		err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, size - last);
> +		if (err)
>   			goto err_ppgtt_cleanup;
> -		}
>   
> +		ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash,
> +					    last, size - last);
>   		cond_resched();
> +
> +		i915_vm_free_pt_stash(&ppgtt->vm, &stash);
>   	}
>   
>   err_ppgtt_cleanup:
> @@ -284,9 +282,18 @@ static int lowlevel_hole(struct i915_address_space *vm,
>   				break;
>   			}
>   
> -			if (vm->allocate_va_range &&
> -			    vm->allocate_va_range(vm, addr, BIT_ULL(size)))
> -				break;
> +			if (vm->allocate_va_range) {
> +				struct i915_vm_pt_stash stash = {};
> +
> +				if (i915_vm_alloc_pt_stash(vm, &stash,
> +							   BIT_ULL(size)))
> +					break;
> +
> +				vm->allocate_va_range(vm, &stash,
> +						      addr, BIT_ULL(size));
> +
> +				i915_vm_free_pt_stash(vm, &stash);
> +			}
>   
>   			mock_vma->pages = obj->mm.pages;
>   			mock_vma->node.size = BIT_ULL(size);
> @@ -1881,6 +1888,7 @@ static int igt_cs_tlb(void *arg)
>   			continue;
>   
>   		while (!__igt_timeout(end_time, NULL)) {
> +			struct i915_vm_pt_stash stash = {};
>   			struct i915_request *rq;
>   			u64 offset;
>   
> @@ -1888,10 +1896,6 @@ static int igt_cs_tlb(void *arg)
>   						   0, vm->total - PAGE_SIZE,
>   						   chunk_size, PAGE_SIZE);
>   
> -			err = vm->allocate_va_range(vm, offset, chunk_size);
> -			if (err)
> -				goto end;
> -
>   			memset32(result, STACK_MAGIC, PAGE_SIZE / sizeof(u32));
>   
>   			vma = i915_vma_instance(bbe, vm, NULL);
> @@ -1904,6 +1908,14 @@ static int igt_cs_tlb(void *arg)
>   			if (err)
>   				goto end;
>   
> +			err = i915_vm_alloc_pt_stash(vm, &stash, chunk_size);
> +			if (err)
> +				goto end;
> +
> +			vm->allocate_va_range(vm, &stash, offset, chunk_size);
> +
> +			i915_vm_free_pt_stash(vm, &stash);
> +
>   			/* Prime the TLB with the dummy pages */
>   			for (i = 0; i < count; i++) {
>   				vma->node.start = offset + i * PAGE_SIZE;
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gtt.c b/drivers/gpu/drm/i915/selftests/mock_gtt.c
> index b173086411ef..5e4fb0fba34b 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gtt.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gtt.c
> @@ -38,14 +38,14 @@ static void mock_insert_entries(struct i915_address_space *vm,
>   {
>   }
>   
> -static int mock_bind_ppgtt(struct i915_address_space *vm,
> -			   struct i915_vma *vma,
> -			   enum i915_cache_level cache_level,
> -			   u32 flags)
> +static void mock_bind_ppgtt(struct i915_address_space *vm,
> +			    struct i915_vm_pt_stash *stash,
> +			    struct i915_vma *vma,
> +			    enum i915_cache_level cache_level,
> +			    u32 flags)
>   {
>   	GEM_BUG_ON(flags & I915_VMA_GLOBAL_BIND);
>   	set_bit(I915_VMA_LOCAL_BIND_BIT, __i915_vma_flags(vma));
> -	return 0;
>   }
>   
>   static void mock_unbind_ppgtt(struct i915_address_space *vm,
> @@ -74,6 +74,7 @@ struct i915_ppgtt *mock_ppgtt(struct drm_i915_private *i915, const char *name)
>   	ppgtt->vm.i915 = i915;
>   	ppgtt->vm.total = round_down(U64_MAX, PAGE_SIZE);
>   	ppgtt->vm.file = ERR_PTR(-ENODEV);
> +	ppgtt->vm.dma = &i915->drm.pdev->dev;
>   
>   	i915_address_space_init(&ppgtt->vm, VM_CLASS_PPGTT);
>   
> @@ -90,13 +91,12 @@ struct i915_ppgtt *mock_ppgtt(struct drm_i915_private *i915, const char *name)
>   	return ppgtt;
>   }
>   
> -static int mock_bind_ggtt(struct i915_address_space *vm,
> -			  struct i915_vma *vma,
> -			  enum i915_cache_level cache_level,
> -			  u32 flags)
> +static void mock_bind_ggtt(struct i915_address_space *vm,
> +			   struct i915_vm_pt_stash *stash,
> +			   struct i915_vma *vma,
> +			   enum i915_cache_level cache_level,
> +			   u32 flags)
>   {
> -	atomic_or(I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND, &vma->flags);
> -	return 0;
>   }
>   
>   static void mock_unbind_ggtt(struct i915_address_space *vm,
> 

