[Intel-gfx] [PATCH v4 4/5] drm/i915: Refine VT-d scanout workaround

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Thu Dec 1 09:00:33 UTC 2022


On 30/11/2022 23:58, Andi Shyti wrote:
> From: Chris Wilson <chris at chris-wilson.co.uk>
> 
> VT-d may cause overfetch of the scanout PTE, both before and after the
> vma (depending on the scanout orientation). bspec recommends that we
> provide a tile-row in either directions, and suggests using 168 PTE,
> warning that the accesses will wrap around the ends of the GGTT.
> Currently, we fill the entire GGTT with scratch pages when using VT-d to
> always ensure there are valid entries around every vma, including
> scanout. However, writing every PTE is slow as on recent devices we
> perform 8MiB of uncached writes, incurring an extra 100ms during resume.
> 
> If instead we focus on only putting guard pages around scanout, we can
> avoid touching the whole GGTT. To avoid having to introduce extra nodes
> around each scanout vma, we adjust the scanout drm_mm_node to be smaller
> than the allocated space, and fixup the extra PTE during dma binding.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Signed-off-by: Tejas Upadhyay <tejaskumarx.surendrakumar.upadhyay at intel.com>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Signed-off-by: Andi Shyti <andi.shyti at linux.intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_domain.c | 13 +++++++++++
>   drivers/gpu/drm/i915/gt/intel_ggtt.c       | 25 +---------------------
>   2 files changed, 14 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
> index 850776a783ac7..9969e687ad857 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
> @@ -17,6 +17,8 @@
>   #include "i915_gem_object.h"
>   #include "i915_vma.h"
>   
> +#define VTD_GUARD (168u * I915_GTT_PAGE_SIZE) /* 168 or tile-row PTE padding */
> +
>   static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
>   {
>   	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> @@ -424,6 +426,17 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
>   	if (ret)
>   		return ERR_PTR(ret);
>   
> +	/* VT-d may overfetch before/after the vma, so pad with scratch */
> +	if (intel_scanout_needs_vtd_wa(i915)) {
> +		unsigned int guard = VTD_GUARD;
> +
> +		if (i915_gem_object_is_tiled(obj))
> +			guard = max(guard,
> +				    i915_gem_object_get_tile_row_size(obj));
> +
> +		flags |= PIN_OFFSET_GUARD | guard;
> +	}
> +
>   	/*
>   	 * As the user may map the buffer once pinned in the display plane
>   	 * (e.g. libkms for the bootup splash), we have to ensure that we
> diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c
> index 784d4a8c43ba9..fa96d925c2596 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
> @@ -376,27 +376,6 @@ static void nop_clear_range(struct i915_address_space *vm,
>   {
>   }
>   
> -static void gen8_ggtt_clear_range(struct i915_address_space *vm,
> -				  u64 start, u64 length)
> -{
> -	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
> -	unsigned int first_entry = start / I915_GTT_PAGE_SIZE;
> -	unsigned int num_entries = length / I915_GTT_PAGE_SIZE;
> -	const gen8_pte_t scratch_pte = vm->scratch[0]->encode;
> -	gen8_pte_t __iomem *gtt_base =
> -		(gen8_pte_t __iomem *)ggtt->gsm + first_entry;
> -	const int max_entries = ggtt_total_entries(ggtt) - first_entry;
> -	int i;
> -
> -	if (WARN(num_entries > max_entries,
> -		 "First entry = %d; Num entries = %d (max=%d)\n",
> -		 first_entry, num_entries, max_entries))
> -		num_entries = max_entries;
> -
> -	for (i = 0; i < num_entries; i++)
> -		gen8_set_pte(&gtt_base[i], scratch_pte);
> -}
> -
>   static void bxt_vtd_ggtt_wa(struct i915_address_space *vm)
>   {
>   	/*
> @@ -968,8 +947,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
>   	ggtt->vm.cleanup = gen6_gmch_remove;
>   	ggtt->vm.insert_page = gen8_ggtt_insert_page;
>   	ggtt->vm.clear_range = nop_clear_range;
> -	if (intel_scanout_needs_vtd_wa(i915))
> -		ggtt->vm.clear_range = gen8_ggtt_clear_range;
>   
>   	ggtt->vm.insert_entries = gen8_ggtt_insert_entries;
>   
> @@ -1128,7 +1105,7 @@ static int gen6_gmch_probe(struct i915_ggtt *ggtt)
>   	ggtt->vm.alloc_scratch_dma = alloc_pt_dma;
>   
>   	ggtt->vm.clear_range = nop_clear_range;
> -	if (!HAS_FULL_PPGTT(i915) || intel_scanout_needs_vtd_wa(i915))
> +	if (!HAS_FULL_PPGTT(i915))
>   		ggtt->vm.clear_range = gen6_ggtt_clear_range;
>   	ggtt->vm.insert_page = gen6_ggtt_insert_page;
>   	ggtt->vm.insert_entries = gen6_ggtt_insert_entries;

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Regards,

Tvrtko


More information about the Intel-gfx mailing list