[Intel-gfx] [PATCH] drm/i915: Refine VT-d scanout workaround
Matthew Auld
matthew.auld at intel.com
Thu Feb 11 17:00:20 UTC 2021
On 11/02/2021 15:19, Chris Wilson wrote:
> Quoting Matthew Auld (2021-02-11 14:25:41)
>> On 10/02/2021 23:39, Chris Wilson wrote:
>>> VT-d may cause overfetch of the scanout PTE, both before and after the
>>> vma (depending on the scanout orientation). bspec recommends that we
>>> provide a tile-row in either direction, and suggests using 160 PTE,
>>> warning that the accesses will wrap around the ends of the GGTT.
>>> Currently, we fill the entire GGTT with scratch pages when using VT-d to
>>> always ensure there are valid entries around every vma, including
>>> scanout. However, writing every PTE is slow as on recent devices we
>>> perform 8MiB of uncached writes, incurring an extra 100ms during resume.
>>>
>>> If instead we focus on only putting guard pages around scanout, we can
>>> avoid touching the whole GGTT. To avoid having to introduce extra nodes
>>> around each scanout vma, we adjust the scanout drm_mm_node to be smaller
>>> than the allocated space, and fixup the extra PTE during dma binding.
>>>
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: Ville Syrjälä <ville.syrjala at linux.intel.com>
>>> Cc: Matthew Auld <matthew.auld at intel.com>
>>> ---
>>> drivers/gpu/drm/i915/gem/i915_gem_domain.c | 3 ++
>>> drivers/gpu/drm/i915/gt/intel_ggtt.c | 37 ++++++++--------------
>>> drivers/gpu/drm/i915/i915_gem_gtt.h | 1 +
>>> drivers/gpu/drm/i915/i915_vma.c | 23 ++++++++++++++
>>> drivers/gpu/drm/i915/i915_vma_types.h | 1 +
>>> 5 files changed, 41 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
>>> index 0478b069c202..9f2ccc255ca1 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
>>> @@ -345,6 +345,9 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
>>> if (ret)
>>> goto err;
>>>
>>> + if (intel_scanout_needs_vtd_wa(i915))
>>> + flags |= PIN_VTD;
>>> +
>>> /*
>>> * As the user may map the buffer once pinned in the display plane
>>> * (e.g. libkms for the bootup splash), we have to ensure that we
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c
>>> index b0b8ded834f0..416f77f48561 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
>>> @@ -238,6 +238,11 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm,
>>>
>>> gte = (gen8_pte_t __iomem *)ggtt->gsm;
>>> gte += vma->node.start / I915_GTT_PAGE_SIZE;
>>> +
>>> + end = gte - vma->guard / I915_GTT_PAGE_SIZE;
>>> + while (end < gte)
>>> + gen8_set_pte(end++, vm->scratch[0]->encode);
>>> +
>>> end = gte + vma->node.size / I915_GTT_PAGE_SIZE;
>>>
>>> for_each_sgt_daddr(addr, iter, vma->pages)
>>> @@ -245,6 +250,7 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm,
>>> GEM_BUG_ON(gte > end);
>>>
>>> /* Fill the allocated but "unused" space beyond the end of the buffer */
>>> + end += vma->guard / I915_GTT_PAGE_SIZE;
>>> while (gte < end)
>>> gen8_set_pte(gte++, vm->scratch[0]->encode);
>>>
>>> @@ -289,6 +295,11 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm,
>>>
>>> gte = (gen6_pte_t __iomem *)ggtt->gsm;
>>> gte += vma->node.start / I915_GTT_PAGE_SIZE;
>>> +
>>> + end = gte - vma->guard / I915_GTT_PAGE_SIZE;
>>> + while (end < gte)
>>> + gen8_set_pte(end++, vm->scratch[0]->encode);
>>> +
>>> end = gte + vma->node.size / I915_GTT_PAGE_SIZE;
>>>
>>> for_each_sgt_daddr(addr, iter, vma->pages)
>>> @@ -296,6 +307,7 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm,
>>> GEM_BUG_ON(gte > end);
>>>
>>> /* Fill the allocated but "unused" space beyond the end of the buffer */
>>> + end += vma->guard / I915_GTT_PAGE_SIZE;
>>> while (gte < end)
>>> iowrite32(vm->scratch[0]->encode, gte++);
>>>
>>> @@ -311,27 +323,6 @@ static void nop_clear_range(struct i915_address_space *vm,
>>> {
>>> }
>>>
>>> -static void gen8_ggtt_clear_range(struct i915_address_space *vm,
>>> - u64 start, u64 length)
>>> -{
>>> - struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
>>> - unsigned int first_entry = start / I915_GTT_PAGE_SIZE;
>>> - unsigned int num_entries = length / I915_GTT_PAGE_SIZE;
>>> - const gen8_pte_t scratch_pte = vm->scratch[0]->encode;
>>> - gen8_pte_t __iomem *gtt_base =
>>> - (gen8_pte_t __iomem *)ggtt->gsm + first_entry;
>>> - const int max_entries = ggtt_total_entries(ggtt) - first_entry;
>>> - int i;
>>> -
>>> - if (WARN(num_entries > max_entries,
>>> - "First entry = %d; Num entries = %d (max=%d)\n",
>>> - first_entry, num_entries, max_entries))
>>> - num_entries = max_entries;
>>> -
>>> - for (i = 0; i < num_entries; i++)
>>> - gen8_set_pte(&gtt_base[i], scratch_pte);
>>> -}
>>> -
>>> static void bxt_vtd_ggtt_wa(struct i915_address_space *vm)
>>> {
>>> /*
>>> @@ -898,8 +889,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
>>> ggtt->vm.cleanup = gen6_gmch_remove;
>>> ggtt->vm.insert_page = gen8_ggtt_insert_page;
>>> ggtt->vm.clear_range = nop_clear_range;
>>> - if (intel_scanout_needs_vtd_wa(i915))
>>> - ggtt->vm.clear_range = gen8_ggtt_clear_range;
>>>
>>> ggtt->vm.insert_entries = gen8_ggtt_insert_entries;
>>>
>>> @@ -1045,7 +1034,7 @@ static int gen6_gmch_probe(struct i915_ggtt *ggtt)
>>> ggtt->vm.alloc_pt_dma = alloc_pt_dma;
>>>
>>> ggtt->vm.clear_range = nop_clear_range;
>>> - if (!HAS_FULL_PPGTT(i915) || intel_scanout_needs_vtd_wa(i915))
>>> + if (!HAS_FULL_PPGTT(i915))
>>> ggtt->vm.clear_range = gen6_ggtt_clear_range;
>>> ggtt->vm.insert_page = gen6_ggtt_insert_page;
>>> ggtt->vm.insert_entries = gen6_ggtt_insert_entries;
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
>>> index c9b0ee5e1d23..8a2dfc7144cf 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>>> @@ -41,6 +41,7 @@ int i915_gem_gtt_insert(struct i915_address_space *vm,
>>> #define PIN_HIGH BIT_ULL(5)
>>> #define PIN_OFFSET_BIAS BIT_ULL(6)
>>> #define PIN_OFFSET_FIXED BIT_ULL(7)
>>> +#define PIN_VTD BIT_ULL(8)
>>>
>>> #define PIN_GLOBAL BIT_ULL(10) /* I915_VMA_GLOBAL_BIND */
>>> #define PIN_USER BIT_ULL(11) /* I915_VMA_LOCAL_BIND */
>>> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
>>> index caa9b041616b..dccd36ff1a6d 100644
>>> --- a/drivers/gpu/drm/i915/i915_vma.c
>>> +++ b/drivers/gpu/drm/i915/i915_vma.c
>>> @@ -38,6 +38,8 @@
>>> #include "i915_trace.h"
>>> #include "i915_vma.h"
>>>
>>> +#define VTD_GUARD roundup_pow_of_two(160 * SZ_4K) /* 160 PTE padding */
>>> +
>>> static struct i915_global_vma {
>>> struct i915_global base;
>>> struct kmem_cache *slab_vmas;
>>> @@ -552,6 +554,9 @@ bool i915_vma_misplaced(const struct i915_vma *vma,
>>> vma->node.start != (flags & PIN_OFFSET_MASK))
>>> return true;
>>>
>>> + if (flags & PIN_VTD && vma->guard < VTD_GUARD)
>>> + return true;
>>> +
>>> return false;
>>> }
>>>
>>> @@ -637,6 +642,13 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
>>> alignment, vma->fence_alignment);
>>> }
>>>
>>> + /* VT-d requires padding before/after the vma */
>>> + if (flags & PIN_VTD) {
>>> + alignment = max_t(typeof(alignment), alignment, VTD_GUARD);
>>> + vma->guard = alignment;
>>> + size += 2 * vma->guard;
>>> + }
>>> +
>>> GEM_BUG_ON(!IS_ALIGNED(size, I915_GTT_PAGE_SIZE));
>>> GEM_BUG_ON(!IS_ALIGNED(alignment, I915_GTT_MIN_ALIGNMENT));
>>> GEM_BUG_ON(!is_power_of_2(alignment));
>>> @@ -725,6 +737,11 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
>>>
>>> list_add_tail(&vma->vm_link, &vma->vm->bound_list);
>>>
>>> + if (flags & PIN_VTD) {
>>> + vma->node.start += vma->guard;
>>> + vma->node.size -= 2 * vma->guard;
>>> + }
>>> +
>>
>> So we adjust the node to be twice as large, reserve it in the mm, and
>> then shrink it again here. Seems a little scary to modify
>> node.start/node.size after insertion?
>
> Definitely scary. So far it feels like the most viable approach. At
> least the concept of including a red-zone around the vma/drm_mm_node as
> part of that structure seems useful, and familiar.
Throwing some color_adjust at it might be another option to consider.
Maybe something like:
+static void i915_ggtt_color_adjust_vdt(const struct drm_mm_node *node,
+ unsigned long color,
+ u64 *start,
+ u64 *end)
+{
+ if (color == COLOR_VDT) {
+ *start += VDT_PADDING;
+ *end -= VDT_PADDING;
+ return;
+ }
+
+ if (node->allocated && node->color == COLOR_VDT)
+ *start += VDT_PADDING;
+
+ node = list_next_entry(node, node_list);
+ if (node->allocated && node->color == COLOR_VDT)
+ *end -= VDT_PADDING;
+}
But not sure if I would call that simpler. Plus we need to add more
special casing in the eviction code. And I guess the cache coloring
might also be active here, which might be nasty. Also we end up with a
bunch of holes in the address space that are unusable, yet the mm search
will keep hitting them anyway. Ok, I guess that's what you meant with
not introducing "extra nodes". Hmmm.
>
>> Just wondering if this might upset something like evict_for_node, where
>> we inspect the allocated nodes over some range, rather than holes, and
>> since we shrunk the node it might get confused into thinking the padding
>> is actually free space. Or maybe that doesn't really matter much for the
>> GGTT?
>
> The stumbling block I had earlier then forgot about is the drm_mm_node's
> computed hole_size uses the adjacent drm_mm_node.start. So as we've
> adjusted the node.start, it will later on declare a bigger hole and
> reuse part of our guard pages. That we will then clobber on resume.
>
> (There's also consequences for the address interval tree, but that will
> continue to work correctly as we won't move the node.start beyond the
> allocated space.)
>
> Hmm. Fortunately, we have wrapped all users of vma->node.start, at least
> for ggtt, with i915_ggtt_offset(). So it is feasible to adjust the
> offset by vma->guard there.
>
> Let's try that.
> -Chris
>
More information about the Intel-gfx
mailing list