[Intel-gfx] [PATCH] drm/i915: Refine VT-d scanout workaround
Matthew Auld
matthew.auld at intel.com
Thu Feb 11 17:00:20 UTC 2021
On 11/02/2021 15:19, Chris Wilson wrote:
> Quoting Matthew Auld (2021-02-11 14:25:41)
>> On 10/02/2021 23:39, Chris Wilson wrote:
>>> VT-d may cause overfetch of the scanout PTE, both before and after the
>>> vma (depending on the scanout orientation). bspec recommends that we
>>> provide a tile-row in either direction, and suggests using 160 PTE,
>>> warning that the accesses will wrap around the ends of the GGTT.
>>> Currently, we fill the entire GGTT with scratch pages when using VT-d to
>>> always ensure there are valid entries around every vma, including
>>> scanout. However, writing every PTE is slow as on recent devices we
>>> perform 8MiB of uncached writes, incurring an extra 100ms during resume.
>>>
>>> If instead we focus on only putting guard pages around scanout, we can
>>> avoid touching the whole GGTT. To avoid having to introduce extra nodes
>>> around each scanout vma, we adjust the scanout drm_mm_node to be smaller
>>> than the allocated space, and fixup the extra PTE during dma binding.
>>>
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: Ville Syrjälä <ville.syrjala at linux.intel.com>
>>> Cc: Matthew Auld <matthew.auld at intel.com>
>>> ---
>>> drivers/gpu/drm/i915/gem/i915_gem_domain.c | 3 ++
>>> drivers/gpu/drm/i915/gt/intel_ggtt.c | 37 ++++++++--------------
>>> drivers/gpu/drm/i915/i915_gem_gtt.h | 1 +
>>> drivers/gpu/drm/i915/i915_vma.c | 23 ++++++++++++++
>>> drivers/gpu/drm/i915/i915_vma_types.h | 1 +
>>> 5 files changed, 41 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
>>> index 0478b069c202..9f2ccc255ca1 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
>>> @@ -345,6 +345,9 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
>>> if (ret)
>>> goto err;
>>>
>>> + if (intel_scanout_needs_vtd_wa(i915))
>>> + flags |= PIN_VTD;
>>> +
>>> /*
>>> * As the user may map the buffer once pinned in the display plane
>>> * (e.g. libkms for the bootup splash), we have to ensure that we
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c
>>> index b0b8ded834f0..416f77f48561 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
>>> @@ -238,6 +238,11 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm,
>>>
>>> gte = (gen8_pte_t __iomem *)ggtt->gsm;
>>> gte += vma->node.start / I915_GTT_PAGE_SIZE;
>>> +
>>> + end = gte - vma->guard / I915_GTT_PAGE_SIZE;
>>> + while (end < gte)
>>> + gen8_set_pte(end++, vm->scratch[0]->encode);
>>> +
>>> end = gte + vma->node.size / I915_GTT_PAGE_SIZE;
>>>
>>> for_each_sgt_daddr(addr, iter, vma->pages)
>>> @@ -245,6 +250,7 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm,
>>> GEM_BUG_ON(gte > end);
>>>
>>> /* Fill the allocated but "unused" space beyond the end of the buffer */
>>> + end += vma->guard / I915_GTT_PAGE_SIZE;
>>> while (gte < end)
>>> gen8_set_pte(gte++, vm->scratch[0]->encode);
>>>
>>> @@ -289,6 +295,11 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm,
>>>
>>> gte = (gen6_pte_t __iomem *)ggtt->gsm;
>>> gte += vma->node.start / I915_GTT_PAGE_SIZE;
>>> +
>>> + end = gte - vma->guard / I915_GTT_PAGE_SIZE;
>>> + while (end < gte)
>>> + gen8_set_pte(end++, vm->scratch[0]->encode);
>>> +
>>> end = gte + vma->node.size / I915_GTT_PAGE_SIZE;
>>>
>>> for_each_sgt_daddr(addr, iter, vma->pages)
>>> @@ -296,6 +307,7 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm,
>>> GEM_BUG_ON(gte > end);
>>>
>>> /* Fill the allocated but "unused" space beyond the end of the buffer */
>>> + end += vma->guard / I915_GTT_PAGE_SIZE;
>>> while (gte < end)
>>> iowrite32(vm->scratch[0]->encode, gte++);
>>>
>>> @@ -311,27 +323,6 @@ static void nop_clear_range(struct i915_address_space *vm,
>>> {
>>> }
>>>
>>> -static void gen8_ggtt_clear_range(struct i915_address_space *vm,
>>> - u64 start, u64 length)
>>> -{
>>> - struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
>>> - unsigned int first_entry = start / I915_GTT_PAGE_SIZE;
>>> - unsigned int num_entries = length / I915_GTT_PAGE_SIZE;
>>> - const gen8_pte_t scratch_pte = vm->scratch[0]->encode;
>>> - gen8_pte_t __iomem *gtt_base =
>>> - (gen8_pte_t __iomem *)ggtt->gsm + first_entry;
>>> - const int max_entries = ggtt_total_entries(ggtt) - first_entry;
>>> - int i;
>>> -
>>> - if (WARN(num_entries > max_entries,
>>> - "First entry = %d; Num entries = %d (max=%d)\n",
>>> - first_entry, num_entries, max_entries))
>>> - num_entries = max_entries;
>>> -
>>> - for (i = 0; i < num_entries; i++)
>>> - gen8_set_pte(&gtt_base[i], scratch_pte);
>>> -}
>>> -
>>> static void bxt_vtd_ggtt_wa(struct i915_address_space *vm)
>>> {
>>> /*
>>> @@ -898,8 +889,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
>>> ggtt->vm.cleanup = gen6_gmch_remove;
>>> ggtt->vm.insert_page = gen8_ggtt_insert_page;
>>> ggtt->vm.clear_range = nop_clear_range;
>>> - if (intel_scanout_needs_vtd_wa(i915))
>>> - ggtt->vm.clear_range = gen8_ggtt_clear_range;
>>>
>>> ggtt->vm.insert_entries = gen8_ggtt_insert_entries;
>>>
>>> @@ -1045,7 +1034,7 @@ static int gen6_gmch_probe(struct i915_ggtt *ggtt)
>>> ggtt->vm.alloc_pt_dma = alloc_pt_dma;
>>>
>>> ggtt->vm.clear_range = nop_clear_range;
>>> - if (!HAS_FULL_PPGTT(i915) || intel_scanout_needs_vtd_wa(i915))
>>> + if (!HAS_FULL_PPGTT(i915))
>>> ggtt->vm.clear_range = gen6_ggtt_clear_range;
>>> ggtt->vm.insert_page = gen6_ggtt_insert_page;
>>> ggtt->vm.insert_entries = gen6_ggtt_insert_entries;
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
>>> index c9b0ee5e1d23..8a2dfc7144cf 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>>> @@ -41,6 +41,7 @@ int i915_gem_gtt_insert(struct i915_address_space *vm,
>>> #define PIN_HIGH BIT_ULL(5)
>>> #define PIN_OFFSET_BIAS BIT_ULL(6)
>>> #define PIN_OFFSET_FIXED BIT_ULL(7)
>>> +#define PIN_VTD BIT_ULL(8)
>>>
>>> #define PIN_GLOBAL BIT_ULL(10) /* I915_VMA_GLOBAL_BIND */
>>> #define PIN_USER BIT_ULL(11) /* I915_VMA_LOCAL_BIND */
>>> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
>>> index caa9b041616b..dccd36ff1a6d 100644
>>> --- a/drivers/gpu/drm/i915/i915_vma.c
>>> +++ b/drivers/gpu/drm/i915/i915_vma.c
>>> @@ -38,6 +38,8 @@
>>> #include "i915_trace.h"
>>> #include "i915_vma.h"
>>>
>>> +#define VTD_GUARD roundup_pow_of_two(160 * SZ_4K) /* 160 PTE padding */
>>> +
>>> static struct i915_global_vma {
>>> struct i915_global base;
>>> struct kmem_cache *slab_vmas;
>>> @@ -552,6 +554,9 @@ bool i915_vma_misplaced(const struct i915_vma *vma,
>>> vma->node.start != (flags & PIN_OFFSET_MASK))
>>> return true;
>>>
>>> + if (flags & PIN_VTD && vma->guard < VTD_GUARD)
>>> + return true;
>>> +
>>> return false;
>>> }
>>>
>>> @@ -637,6 +642,13 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
>>> alignment, vma->fence_alignment);
>>> }
>>>
>>> + /* VT-d requires padding before/after the vma */
>>> + if (flags & PIN_VTD) {
>>> + alignment = max_t(typeof(alignment), alignment, VTD_GUARD);
>>> + vma->guard = alignment;
>>> + size += 2 * vma->guard;
>>> + }
>>> +
>>> GEM_BUG_ON(!IS_ALIGNED(size, I915_GTT_PAGE_SIZE));
>>> GEM_BUG_ON(!IS_ALIGNED(alignment, I915_GTT_MIN_ALIGNMENT));
>>> GEM_BUG_ON(!is_power_of_2(alignment));
>>> @@ -725,6 +737,11 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
>>>
>>> list_add_tail(&vma->vm_link, &vma->vm->bound_list);
>>>
>>> + if (flags & PIN_VTD) {
>>> + vma->node.start += vma->guard;
>>> + vma->node.size -= 2 * vma->guard;
>>> + }
>>> +
>>
>> So we adjust the node to be twice as large, reserve it in the mm, and
>> then shrink it again here. Seems a little scary to modify
>> node.start/node.size after insertion?
>
> Definitely scary. So far it feels like the most viable approach. At
> least the concept of including a red-zone around the vma/drm_mm_node as
> part of that structure seems useful, and familiar.
Throwing some color_adjust at it might be another option to consider.
Maybe something like:
+static void i915_ggtt_color_adjust_vdt(const struct drm_mm_node *node,
+ unsigned long color,
+ u64 *start,
+ u64 *end)
+{
+ if (color == COLOR_VDT) {
+ *start += VDT_PADDING;
+ *end -= VDT_PADDING;
+ return;
+ }
+
+ if (node->allocated && node->color == COLOR_VDT)
+ *start += VDT_PADDING;
+
+ node = list_next_entry(node, node_list);
+ if (node->allocated && node->color == COLOR_VDT)
+ *end -= VDT_PADDING;
+}
But not sure if I would call that simpler. Plus we need to add more
special casing in the eviction code. And I guess the cache coloring
might also be active here, which might be nasty. Also we end up with a
bunch of holes in the address space that are unusable, yet the mm search
will keep hitting them anyway. Ok, I guess that's what you meant with
not introducing "extra nodes". Hmmm.
>
>> Just wondering if this might upset something like evict_for_node, where
>> we inspect the allocated nodes over some range, rather than holes, and
>> since we shrunk the node it might get confused into thinking the padding
>> is actually free space. Or maybe that doesn't really matter much for the
>> GGTT?
>
> The stumbling block I had earlier then forgot about is the drm_mm_node's
> computed hole_size uses the adjacent drm_mm_node.start. So as we've
> adjusted the node.start, it will later on declare a bigger hole and
> reuse part of our guard pages. That we will then clobber on resume.
>
> (There's also consequences for the address interval tree, but that will
> continue to work correctly as we won't move the node.start beyond the
> allocated space.)
>
> Hmm. Fortunately, we have wrapped all users of vma->node.start, at least
> for ggtt, with i915_ggtt_offset(). So it is feasible to adjust the
> offset by vma->guard there.
>
> Let's try that.
> -Chris
>
More information about the Intel-gfx
mailing list