[PATCH 6/8] drm/amdgpu: meld together VM fragment and huge page handling
Christian König
ckoenig.leichtzumerken at gmail.com
Mon Nov 12 15:16:25 UTC 2018
On 09.11.18 at 13:13, Koenig, Christian wrote:
> Adding Aaron who initially reported the problem with this patch and
> suspend/resume.
>
> Am 08.11.18 um 20:35 schrieb Samuel Pitoiset:
>> On 11/8/18 5:50 PM, Christian König wrote:
>>> Hi Samuel,
>>>
>>> yeah, that is a known issue and I was already looking into it all
>>> day today. The problem is that I can't reproduce it reliably.
>>>
>>> Do you have any hint on how to trigger it? E.g. memory exhaustion, maybe?
>> Hi Christian,
>>
>> Good to hear that you are already working on it.
>>
>> The GPU hang with Rise of the Tomb Raider is reliable on my side. It
>> always hangs while loading the benchmark. I use the Ultra High preset.
>>
>> Do you have access to that game? As mentioned, F1 2017 is also
>> affected but I used RoTR during my bisect.
> I was able to reproduce the problem with a memory stress test, but I still
> don't have the slightest idea what is going wrong here.
OK, I found the root cause: we are setting the huge page flag at the wrong
level (on the parent instead of the child).
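To make that concrete, here is a heavily simplified sketch (hypothetical
structures, not the actual amdgpu code and not the fix itself): the huge
page marker has to be set on the entry whose address range the large
fragment actually covers, not on the directory one level above it.

    #include <stdbool.h>

    /* Hypothetical, simplified page table entry -- illustration only. */
    struct pt_entry {
            struct pt_entry *parent;
            bool huge;
    };

    static void mark_range_as_huge(struct pt_entry *entry)
    {
            /* Wrong: entry->parent->huge = true; -- the flag lands one
             * level too high, so later walks treat the wrong subtree as
             * already covered by a huge mapping.
             */
            entry->huge = true;     /* flag the entry covering the range */
    }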
Currently testing a fix for this,
Christian.
>
> Going to investigate further,
> Christian.
>
>> Thanks.
>>
>>> Thanks,
>>> Christian.
>>>
>>> On 08.11.18 at 17:17, Samuel Pitoiset wrote:
>>>> Hi,
>>>>
>>>> This change introduces GPU hangs with F1 2017 and Rise of the Tomb
>>>> Raider on RADV/GFX9. I bisected the issue.
>>>>
>>>> Can you have a look?
>>>>
>>>> Thanks!
>>>>
>>>> On 9/12/18 10:54 AM, Christian König wrote:
>>>>> This optimizes the generation of PTEs by walking the hierarchy only
>>>>> once for a range and making changes as necessary.
>>>>>
>>>>> It allows for both huge (2MB) as well as giant (1GB) pages to be used on
>>>>> Vega and Raven.
>>>>>
>>>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>>>> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>>>> Acked-by: Junwei Zhang <Jerry.Zhang at amd.com>
>>>>> ---
>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 267 ++++++++++++++++++---------------
>>>>> 1 file changed, 147 insertions(+), 120 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> index 747b6ff6fea7..328325324a1d 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> @@ -1488,46 +1488,76 @@ int amdgpu_vm_update_directories(struct amdgpu_device *adev,
>>>>> }
>>>>> /**
>>>>> - * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages
>>>>> + * amdgpu_vm_update_huge - figure out parameters for PTE updates
>>>>> *
>>>>> - * @p: see amdgpu_pte_update_params definition
>>>>> - * @entry: vm_pt entry to check
>>>>> - * @parent: parent entry
>>>>> - * @nptes: number of PTEs updated with this operation
>>>>> - * @dst: destination address where the PTEs should point to
>>>>> - * @flags: access flags fro the PTEs
>>>>> - *
>>>>> - * Check if we can update the PD with a huge page.
>>>>> + * Make sure to set the right flags for the PTEs at the desired level.
>>>>> */
>>>>> -static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
>>>>> - struct amdgpu_vm_pt *entry,
>>>>> - struct amdgpu_vm_pt *parent,
>>>>> - unsigned nptes, uint64_t dst,
>>>>> - uint64_t flags)
>>>>> -{
>>>>> - uint64_t pde;
>>>>> +static void amdgpu_vm_update_huge(struct amdgpu_pte_update_params *params,
>>>>> + struct amdgpu_bo *bo, unsigned level,
>>>>> + uint64_t pe, uint64_t addr,
>>>>> + unsigned count, uint32_t incr,
>>>>> + uint64_t flags)
>>>>> - /* In the case of a mixed PT the PDE must point to it*/
>>>>> - if (p->adev->asic_type >= CHIP_VEGA10 && !p->src &&
>>>>> - nptes == AMDGPU_VM_PTE_COUNT(p->adev)) {
>>>>> - /* Set the huge page flag to stop scanning at this PDE */
>>>>> +{
>>>>> + if (level != AMDGPU_VM_PTB) {
>>>>> flags |= AMDGPU_PDE_PTE;
>>>>> + amdgpu_gmc_get_vm_pde(params->adev, level, &addr, &flags);
>>>>> }
>>>>> - if (!(flags & AMDGPU_PDE_PTE)) {
>>>>> - if (entry->huge) {
>>>>> - /* Add the entry to the relocated list to update it. */
>>>>> - entry->huge = false;
>>>>> - amdgpu_vm_bo_relocated(&entry->base);
>>>>> - }
>>>>> + amdgpu_vm_update_func(params, bo, pe, addr, count, incr, flags);
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * amdgpu_vm_fragment - get fragment for PTEs
>>>>> + *
>>>>> + * @params: see amdgpu_pte_update_params definition
>>>>> + * @start: first PTE to handle
>>>>> + * @end: last PTE to handle
>>>>> + * @flags: hw mapping flags
>>>>> + * @frag: resulting fragment size
>>>>> + * @frag_end: end of this fragment
>>>>> + *
>>>>> + * Returns the first possible fragment for the start and end address.
>>>>> + */
>>>>> +static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params,
>>>>> + uint64_t start, uint64_t end, uint64_t flags,
>>>>> + unsigned int *frag, uint64_t *frag_end)
>>>>> +{
>>>>> + /**
>>>>> + * The MC L1 TLB supports variable sized pages, based on a fragment
>>>>> + * field in the PTE. When this field is set to a non-zero value, page
>>>>> + * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
>>>>> + * flags are considered valid for all PTEs within the fragment range
>>>>> + * and corresponding mappings are assumed to be physically contiguous.
>>>>> + *
>>>>> + * The L1 TLB can store a single PTE for the whole fragment,
>>>>> + * significantly increasing the space available for translation
>>>>> + * caching. This leads to large improvements in throughput when the
>>>>> + * TLB is under pressure.
>>>>> + *
>>>>> + * The L2 TLB distributes small and large fragments into two
>>>>> + * asymmetric partitions. The large fragment cache is significantly
>>>>> + * larger. Thus, we try to use large fragments wherever possible.
>>>>> + * Userspace can support this by aligning virtual base address and
>>>>> + * allocation size to the fragment size.
>>>>> + */
>>>>> + unsigned max_frag = params->adev->vm_manager.fragment_size;
>>>>> +
>>>>> + /* system pages are non continuously */
>>>>> + if (params->src || !(flags & AMDGPU_PTE_VALID)) {
>>>>> + *frag = 0;
>>>>> + *frag_end = end;
>>>>> return;
>>>>> }
>>>>> - entry->huge = true;
>>>>> - amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags);
>>>>> -
>>>>> - pde = (entry - parent->entries) * 8;
>>>>> - amdgpu_vm_update_func(p, parent->base.bo, pde, dst, 1, 0, flags);
>>>>> + /* This intentionally wraps around if no bit is set */
>>>>> + *frag = min((unsigned)ffs(start) - 1, (unsigned)fls64(end - start) - 1);
>>>>> + if (*frag >= max_frag) {
>>>>> + *frag = max_frag;
>>>>> + *frag_end = end & ~((1ULL << max_frag) - 1);
>>>>> + } else {
>>>>> + *frag_end = start + (1 << *frag);
>>>>> + }
>>>>> }
>>>>> /**
>>>>> @@ -1545,108 +1575,105 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
>>>>> * 0 for success, -EINVAL for failure.
>>>>> */
>>>>> static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
>>>>> - uint64_t start, uint64_t end,
>>>>> - uint64_t dst, uint64_t flags)
>>>>> + uint64_t start, uint64_t end,
>>>>> + uint64_t dst, uint64_t flags)
>>>>> {
>>>>> struct amdgpu_device *adev = params->adev;
>>>>> - const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
>>>>> struct amdgpu_vm_pt_cursor cursor;
>>>>> + uint64_t frag_start = start, frag_end;
>>>>> + unsigned int frag;
>>>>> - /* walk over the address space and update the page tables */
>>>>> - for_each_amdgpu_vm_pt_leaf(adev, params->vm, start, end - 1, cursor) {
>>>>> + /* figure out the initial fragment */
>>>>> + amdgpu_vm_fragment(params, frag_start, end, flags, &frag, &frag_end);
>>>>> +
>>>>> + /* walk over the address space and update the PTs */
>>>>> + amdgpu_vm_pt_start(adev, params->vm, start, &cursor);
>>>>> + while (cursor.pfn < end) {
>>>>> struct amdgpu_bo *pt = cursor.entry->base.bo;
>>>>> - uint64_t pe_start;
>>>>> - unsigned nptes;
>>>>> + unsigned shift, parent_shift, num_entries;
>>>>> + uint64_t incr, entry_end, pe_start;
>>>>> - if (!pt || cursor.level != AMDGPU_VM_PTB)
>>>>> + if (!pt)
>>>>> return -ENOENT;
>>>>> - if ((cursor.pfn & ~mask) == (end & ~mask))
>>>>> - nptes = end - cursor.pfn;
>>>>> - else
>>>>> - nptes = AMDGPU_VM_PTE_COUNT(adev) - (cursor.pfn & mask);
>>>>> -
>>>>> - amdgpu_vm_handle_huge_pages(params, cursor.entry, cursor.parent,
>>>>> - nptes, dst, flags);
>>>>> - /* We don't need to update PTEs for huge pages */
>>>>> - if (cursor.entry->huge) {
>>>>> - dst += nptes * AMDGPU_GPU_PAGE_SIZE;
>>>>> + /* The root level can't be a huge page */
>>>>> + if (cursor.level == adev->vm_manager.root_level) {
>>>>> + if (!amdgpu_vm_pt_descendant(adev, &cursor))
>>>>> + return -ENOENT;
>>>>> continue;
>>>>> }
>>>>> - pe_start = (cursor.pfn & mask) * 8;
>>>>> - amdgpu_vm_update_func(params, pt, pe_start, dst, nptes,
>>>>> - AMDGPU_GPU_PAGE_SIZE, flags);
>>>>> - dst += nptes * AMDGPU_GPU_PAGE_SIZE;
>>>>> - }
>>>>> -
>>>>> - return 0;
>>>>> -}
>>>>> + /* First check if the entry is already handled */
>>>>> + if (cursor.pfn < frag_start) {
>>>>> + cursor.entry->huge = true;
>>>>> + amdgpu_vm_pt_next(adev, &cursor);
>>>>> + continue;
>>>>> + }
>>>>> -/*
>>>>> - * amdgpu_vm_frag_ptes - add fragment information to PTEs
>>>>> - *
>>>>> - * @params: see amdgpu_pte_update_params definition
>>>>> - * @vm: requested vm
>>>>> - * @start: first PTE to handle
>>>>> - * @end: last PTE to handle
>>>>> - * @dst: addr those PTEs should point to
>>>>> - * @flags: hw mapping flags
>>>>> - *
>>>>> - * Returns:
>>>>> - * 0 for success, -EINVAL for failure.
>>>>> - */
>>>>> -static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
>>>>> - uint64_t start, uint64_t end,
>>>>> - uint64_t dst, uint64_t flags)
>>>>> -{
>>>>> - /**
>>>>> - * The MC L1 TLB supports variable sized pages, based on a fragment
>>>>> - * field in the PTE. When this field is set to a non-zero value, page
>>>>> - * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
>>>>> - * flags are considered valid for all PTEs within the fragment range
>>>>> - * and corresponding mappings are assumed to be physically contiguous.
>>>>> - *
>>>>> - * The L1 TLB can store a single PTE for the whole fragment,
>>>>> - * significantly increasing the space available for translation
>>>>> - * caching. This leads to large improvements in throughput when the
>>>>> - * TLB is under pressure.
>>>>> - *
>>>>> - * The L2 TLB distributes small and large fragments into two
>>>>> - * asymmetric partitions. The large fragment cache is significantly
>>>>> - * larger. Thus, we try to use large fragments wherever possible.
>>>>> - * Userspace can support this by aligning virtual base address and
>>>>> - * allocation size to the fragment size.
>>>>> - */
>>>>> - unsigned max_frag = params->adev->vm_manager.fragment_size;
>>>>> - int r;
>>>>> + /* If it isn't already handled it can't be a huge page */
>>>>> + if (cursor.entry->huge) {
>>>>> + /* Add the entry to the relocated list to update it. */
>>>>> + cursor.entry->huge = false;
>>>>> + amdgpu_vm_bo_relocated(&cursor.entry->base);
>>>>> + }
>>>>> - /* system pages are non continuously */
>>>>> - if (params->src || !(flags & AMDGPU_PTE_VALID))
>>>>> - return amdgpu_vm_update_ptes(params, start, end, dst, flags);
>>>>> -
>>>>> - while (start != end) {
>>>>> - uint64_t frag_flags, frag_end;
>>>>> - unsigned frag;
>>>>> -
>>>>> - /* This intentionally wraps around if no bit is set */
>>>>> - frag = min((unsigned)ffs(start) - 1,
>>>>> - (unsigned)fls64(end - start) - 1);
>>>>> - if (frag >= max_frag) {
>>>>> - frag_flags = AMDGPU_PTE_FRAG(max_frag);
>>>>> - frag_end = end & ~((1ULL << max_frag) - 1);
>>>>> - } else {
>>>>> - frag_flags = AMDGPU_PTE_FRAG(frag);
>>>>> - frag_end = start + (1 << frag);
>>>>> + shift = amdgpu_vm_level_shift(adev, cursor.level);
>>>>> + parent_shift = amdgpu_vm_level_shift(adev, cursor.level - 1);
>>>>> + if (adev->asic_type < CHIP_VEGA10) {
>>>>> + /* No huge page support before GMC v9 */
>>>>> + if (cursor.level != AMDGPU_VM_PTB) {
>>>>> + if (!amdgpu_vm_pt_descendant(adev, &cursor))
>>>>> + return -ENOENT;
>>>>> + continue;
>>>>> + }
>>>>> + } else if (frag < shift) {
>>>>> + /* We can't use this level when the fragment size is
>>>>> + * smaller than the address shift. Go to the next
>>>>> + * child entry and try again.
>>>>> + */
>>>>> + if (!amdgpu_vm_pt_descendant(adev, &cursor))
>>>>> + return -ENOENT;
>>>>> + continue;
>>>>> + } else if (frag >= parent_shift) {
>>>>> + /* If the fragment size is even larger than the parent
>>>>> + * shift we should go up one level and check it again.
>>>>> + */
>>>>> + if (!amdgpu_vm_pt_ancestor(&cursor))
>>>>> + return -ENOENT;
>>>>> + continue;
>>>>> }
>>>>> - r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
>>>>> - flags | frag_flags);
>>>>> - if (r)
>>>>> - return r;
>>>>> + /* Looks good so far, calculate parameters for the update */
>>>>> + incr = AMDGPU_GPU_PAGE_SIZE << shift;
>>>>> + num_entries = amdgpu_vm_num_entries(adev, cursor.level);
>>>>> + pe_start = ((cursor.pfn >> shift) & (num_entries - 1)) * 8;
>>>>> + entry_end = num_entries << shift;
>>>>> + entry_end += cursor.pfn & ~(entry_end - 1);
>>>>> + entry_end = min(entry_end, end);
>>>>> +
>>>>> + do {
>>>>> + uint64_t upd_end = min(entry_end, frag_end);
>>>>> + unsigned nptes = (upd_end - frag_start) >> shift;
>>>>> +
>>>>> + amdgpu_vm_update_huge(params, pt, cursor.level,
>>>>> + pe_start, dst, nptes, incr,
>>>>> + flags | AMDGPU_PTE_FRAG(frag));
>>>>> +
>>>>> + pe_start += nptes * 8;
>>>>> + dst += nptes * AMDGPU_GPU_PAGE_SIZE << shift;
>>>>> +
>>>>> + frag_start = upd_end;
>>>>> + if (frag_start >= frag_end) {
>>>>> + /* figure out the next fragment */
>>>>> + amdgpu_vm_fragment(params, frag_start, end,
>>>>> + flags, &frag, &frag_end);
>>>>> + if (frag < shift)
>>>>> + break;
>>>>> + }
>>>>> + } while (frag_start < entry_end);
>>>>> - dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
>>>>> - start = frag_end;
>>>>> + if (frag >= shift)
>>>>> + amdgpu_vm_pt_next(adev, &cursor);
>>>>> }
>>>>> return 0;
>>>>> @@ -1708,8 +1735,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>>>>> params.func = amdgpu_vm_cpu_set_ptes;
>>>>> params.pages_addr = pages_addr;
>>>>> - return amdgpu_vm_frag_ptes(&params, start, last + 1,
>>>>> - addr, flags);
>>>>> + return amdgpu_vm_update_ptes(&params, start, last + 1,
>>>>> + addr, flags);
>>>>> }
>>>>> ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched);
>>>>> @@ -1788,7 +1815,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>>>>> if (r)
>>>>> goto error_free;
>>>>> - r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
>>>>> + r = amdgpu_vm_update_ptes(&params, start, last + 1, addr, flags);
>>>>> if (r)
>>>>> goto error_free;
>>>>>
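As a side note for anyone reading along: the fragment field described in the
comment above encodes the page granularity as 1 << (12 + frag), so the 2MB
huge pages mentioned in the commit message correspond to frag = 9 and the
1GB giant pages to frag = 18. A tiny stand-alone snippet to sanity-check that
arithmetic (illustrative userspace code, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
            /* granularity = 1 << (12 + frag): 0 -> 4KB, 9 -> 2MB, 18 -> 1GB */
            unsigned frags[] = { 0, 9, 18 };

            for (unsigned i = 0; i < 3; i++)
                    printf("frag %2u -> %llu bytes\n",
                           frags[i], 1ULL << (12 + frags[i]));
            return 0;
    }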
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx