[PATCH 6/8] drm/amdgpu: meld together VM fragment and huge page handling
Koenig, Christian
Christian.Koenig at amd.com
Fri Nov 9 12:13:02 UTC 2018
Adding Aaron, who initially reported the problem with this patch and
suspend/resume.
On 08.11.18 at 20:35, Samuel Pitoiset wrote:
> On 11/8/18 5:50 PM, Christian König wrote:
>> Hi Samuel,
>>
>> yeah, that is a known issue and I was already looking into that all
>> day today. The problem is that I can't reproduce it reliably.
>>
>> Do you have any hint on how to trigger that? E.g. memory exhaustion, maybe?
>
> Hi Christian,
>
> Good to hear that you are already working on it.
>
> The GPU hang with Rise of the Tomb Raider is reliable on my side. It
> always hangs while loading the benchmark. I use the Ultra High preset.
>
> Do you have access to that game? As mentioned, F1 2017 is also
> affected but I used RoTR during my bisect.
I was able to reproduce the problem with a memory stress test, but I still
don't have the slightest idea what is going wrong here.
Going to investigate further,
Christian.
>
> Thanks.
>
>>
>> Thanks,
>> Christian.
>>
>> On 08.11.18 at 17:17, Samuel Pitoiset wrote:
>>> Hi,
>>>
>>> This change introduces GPU hangs with F1 2017 and Rise of the Tomb
>>> Raider on RADV/GFX9. I bisected the issue.
>>>
>>> Can you have a look?
>>>
>>> Thanks!
>>>
>>> On 9/12/18 10:54 AM, Christian König wrote:
>>>> This optimizes the generation of PTEs by walking the hierarchy only once
>>>> for a range and making changes as necessary.
>>>>
>>>> It allows for both huge (2MB) as well as giant (1GB) pages to be used on
>>>> Vega and Raven.
>>>>
>>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>>> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>>> Acked-by: Junwei Zhang <Jerry.Zhang at amd.com>
>>>> ---
>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 267 ++++++++++++++++++---------------
>>>> 1 file changed, 147 insertions(+), 120 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index 747b6ff6fea7..328325324a1d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -1488,46 +1488,76 @@ int amdgpu_vm_update_directories(struct amdgpu_device *adev,
>>>> }
>>>> /**
>>>> - * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages
>>>> + * amdgpu_vm_update_huge - figure out parameters for PTE updates
>>>> *
>>>> - * @p: see amdgpu_pte_update_params definition
>>>> - * @entry: vm_pt entry to check
>>>> - * @parent: parent entry
>>>> - * @nptes: number of PTEs updated with this operation
>>>> - * @dst: destination address where the PTEs should point to
>>>> - * @flags: access flags fro the PTEs
>>>> - *
>>>> - * Check if we can update the PD with a huge page.
>>>> + * Make sure to set the right flags for the PTEs at the desired level.
>>>> */
>>>> -static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
>>>> - struct amdgpu_vm_pt *entry,
>>>> - struct amdgpu_vm_pt *parent,
>>>> - unsigned nptes, uint64_t dst,
>>>> - uint64_t flags)
>>>> -{
>>>> - uint64_t pde;
>>>> +static void amdgpu_vm_update_huge(struct amdgpu_pte_update_params *params,
>>>> + struct amdgpu_bo *bo, unsigned level,
>>>> + uint64_t pe, uint64_t addr,
>>>> + unsigned count, uint32_t incr,
>>>> + uint64_t flags)
>>>> - /* In the case of a mixed PT the PDE must point to it*/
>>>> - if (p->adev->asic_type >= CHIP_VEGA10 && !p->src &&
>>>> - nptes == AMDGPU_VM_PTE_COUNT(p->adev)) {
>>>> - /* Set the huge page flag to stop scanning at this PDE */
>>>> +{
>>>> + if (level != AMDGPU_VM_PTB) {
>>>> flags |= AMDGPU_PDE_PTE;
>>>> + amdgpu_gmc_get_vm_pde(params->adev, level, &addr, &flags);
>>>> }
>>>> - if (!(flags & AMDGPU_PDE_PTE)) {
>>>> - if (entry->huge) {
>>>> - /* Add the entry to the relocated list to update it. */
>>>> - entry->huge = false;
>>>> - amdgpu_vm_bo_relocated(&entry->base);
>>>> - }
>>>> + amdgpu_vm_update_func(params, bo, pe, addr, count, incr, flags);
>>>> +}
>>>> +
>>>> +/**
>>>> + * amdgpu_vm_fragment - get fragment for PTEs
>>>> + *
>>>> + * @params: see amdgpu_pte_update_params definition
>>>> + * @start: first PTE to handle
>>>> + * @end: last PTE to handle
>>>> + * @flags: hw mapping flags
>>>> + * @frag: resulting fragment size
>>>> + * @frag_end: end of this fragment
>>>> + *
>>>> + * Returns the first possible fragment for the start and end address.
>>>> + */
>>>> +static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params,
>>>> + uint64_t start, uint64_t end, uint64_t flags,
>>>> + unsigned int *frag, uint64_t *frag_end)
>>>> +{
>>>> + /**
>>>> + * The MC L1 TLB supports variable sized pages, based on a fragment
>>>> + * field in the PTE. When this field is set to a non-zero value, page
>>>> + * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
>>>> + * flags are considered valid for all PTEs within the fragment range
>>>> + * and corresponding mappings are assumed to be physically contiguous.
>>>> + *
>>>> + * The L1 TLB can store a single PTE for the whole fragment,
>>>> + * significantly increasing the space available for translation
>>>> + * caching. This leads to large improvements in throughput when the
>>>> + * TLB is under pressure.
>>>> + *
>>>> + * The L2 TLB distributes small and large fragments into two
>>>> + * asymmetric partitions. The large fragment cache is significantly
>>>> + * larger. Thus, we try to use large fragments wherever possible.
>>>> + * Userspace can support this by aligning virtual base address and
>>>> + * allocation size to the fragment size.
>>>> + */
>>>> + unsigned max_frag = params->adev->vm_manager.fragment_size;
>>>> +
>>>> + /* system pages are non continuously */
>>>> + if (params->src || !(flags & AMDGPU_PTE_VALID)) {
>>>> + *frag = 0;
>>>> + *frag_end = end;
>>>> return;
>>>> }
>>>> - entry->huge = true;
>>>> - amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags);
>>>> -
>>>> - pde = (entry - parent->entries) * 8;
>>>> - amdgpu_vm_update_func(p, parent->base.bo, pde, dst, 1, 0, flags);
>>>> + /* This intentionally wraps around if no bit is set */
>>>> + *frag = min((unsigned)ffs(start) - 1, (unsigned)fls64(end - start) - 1);
>>>> + if (*frag >= max_frag) {
>>>> + *frag = max_frag;
>>>> + *frag_end = end & ~((1ULL << max_frag) - 1);
>>>> + } else {
>>>> + *frag_end = start + (1 << *frag);
>>>> + }
>>>> }
>>>> /**
>>>> @@ -1545,108 +1575,105 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
>>>> * 0 for success, -EINVAL for failure.
>>>> */
>>>> static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
>>>> - uint64_t start, uint64_t end,
>>>> - uint64_t dst, uint64_t flags)
>>>> + uint64_t start, uint64_t end,
>>>> + uint64_t dst, uint64_t flags)
>>>> {
>>>> struct amdgpu_device *adev = params->adev;
>>>> - const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
>>>> struct amdgpu_vm_pt_cursor cursor;
>>>> + uint64_t frag_start = start, frag_end;
>>>> + unsigned int frag;
>>>> - /* walk over the address space and update the page tables */
>>>> - for_each_amdgpu_vm_pt_leaf(adev, params->vm, start, end - 1, cursor) {
>>>> + /* figure out the initial fragment */
>>>> + amdgpu_vm_fragment(params, frag_start, end, flags, &frag, &frag_end);
>>>> +
>>>> + /* walk over the address space and update the PTs */
>>>> + amdgpu_vm_pt_start(adev, params->vm, start, &cursor);
>>>> + while (cursor.pfn < end) {
>>>> struct amdgpu_bo *pt = cursor.entry->base.bo;
>>>> - uint64_t pe_start;
>>>> - unsigned nptes;
>>>> + unsigned shift, parent_shift, num_entries;
>>>> + uint64_t incr, entry_end, pe_start;
>>>> - if (!pt || cursor.level != AMDGPU_VM_PTB)
>>>> + if (!pt)
>>>> return -ENOENT;
>>>> - if ((cursor.pfn & ~mask) == (end & ~mask))
>>>> - nptes = end - cursor.pfn;
>>>> - else
>>>> - nptes = AMDGPU_VM_PTE_COUNT(adev) - (cursor.pfn & mask);
>>>> -
>>>> - amdgpu_vm_handle_huge_pages(params, cursor.entry, cursor.parent,
>>>> - nptes, dst, flags);
>>>> - /* We don't need to update PTEs for huge pages */
>>>> - if (cursor.entry->huge) {
>>>> - dst += nptes * AMDGPU_GPU_PAGE_SIZE;
>>>> + /* The root level can't be a huge page */
>>>> + if (cursor.level == adev->vm_manager.root_level) {
>>>> + if (!amdgpu_vm_pt_descendant(adev, &cursor))
>>>> + return -ENOENT;
>>>> continue;
>>>> }
>>>> - pe_start = (cursor.pfn & mask) * 8;
>>>> - amdgpu_vm_update_func(params, pt, pe_start, dst, nptes,
>>>> - AMDGPU_GPU_PAGE_SIZE, flags);
>>>> - dst += nptes * AMDGPU_GPU_PAGE_SIZE;
>>>> - }
>>>> -
>>>> - return 0;
>>>> -}
>>>> + /* First check if the entry is already handled */
>>>> + if (cursor.pfn < frag_start) {
>>>> + cursor.entry->huge = true;
>>>> + amdgpu_vm_pt_next(adev, &cursor);
>>>> + continue;
>>>> + }
>>>> -/*
>>>> - * amdgpu_vm_frag_ptes - add fragment information to PTEs
>>>> - *
>>>> - * @params: see amdgpu_pte_update_params definition
>>>> - * @vm: requested vm
>>>> - * @start: first PTE to handle
>>>> - * @end: last PTE to handle
>>>> - * @dst: addr those PTEs should point to
>>>> - * @flags: hw mapping flags
>>>> - *
>>>> - * Returns:
>>>> - * 0 for success, -EINVAL for failure.
>>>> - */
>>>> -static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
>>>> - uint64_t start, uint64_t end,
>>>> - uint64_t dst, uint64_t flags)
>>>> -{
>>>> - /**
>>>> - * The MC L1 TLB supports variable sized pages, based on a fragment
>>>> - * field in the PTE. When this field is set to a non-zero value, page
>>>> - * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
>>>> - * flags are considered valid for all PTEs within the fragment range
>>>> - * and corresponding mappings are assumed to be physically contiguous.
>>>> - *
>>>> - * The L1 TLB can store a single PTE for the whole fragment,
>>>> - * significantly increasing the space available for translation
>>>> - * caching. This leads to large improvements in throughput when the
>>>> - * TLB is under pressure.
>>>> - *
>>>> - * The L2 TLB distributes small and large fragments into two
>>>> - * asymmetric partitions. The large fragment cache is significantly
>>>> - * larger. Thus, we try to use large fragments wherever possible.
>>>> - * Userspace can support this by aligning virtual base address and
>>>> - * allocation size to the fragment size.
>>>> - */
>>>> - unsigned max_frag = params->adev->vm_manager.fragment_size;
>>>> - int r;
>>>> + /* If it isn't already handled it can't be a huge page */
>>>> + if (cursor.entry->huge) {
>>>> + /* Add the entry to the relocated list to update it. */
>>>> + cursor.entry->huge = false;
>>>> + amdgpu_vm_bo_relocated(&cursor.entry->base);
>>>> + }
>>>> - /* system pages are non continuously */
>>>> - if (params->src || !(flags & AMDGPU_PTE_VALID))
>>>> - return amdgpu_vm_update_ptes(params, start, end, dst, flags);
>>>> -
>>>> - while (start != end) {
>>>> - uint64_t frag_flags, frag_end;
>>>> - unsigned frag;
>>>> -
>>>> - /* This intentionally wraps around if no bit is set */
>>>> - frag = min((unsigned)ffs(start) - 1,
>>>> - (unsigned)fls64(end - start) - 1);
>>>> - if (frag >= max_frag) {
>>>> - frag_flags = AMDGPU_PTE_FRAG(max_frag);
>>>> - frag_end = end & ~((1ULL << max_frag) - 1);
>>>> - } else {
>>>> - frag_flags = AMDGPU_PTE_FRAG(frag);
>>>> - frag_end = start + (1 << frag);
>>>> + shift = amdgpu_vm_level_shift(adev, cursor.level);
>>>> + parent_shift = amdgpu_vm_level_shift(adev, cursor.level - 1);
>>>> + if (adev->asic_type < CHIP_VEGA10) {
>>>> + /* No huge page support before GMC v9 */
>>>> + if (cursor.level != AMDGPU_VM_PTB) {
>>>> + if (!amdgpu_vm_pt_descendant(adev, &cursor))
>>>> + return -ENOENT;
>>>> + continue;
>>>> + }
>>>> + } else if (frag < shift) {
>>>> + /* We can't use this level when the fragment size is
>>>> + * smaller than the address shift. Go to the next
>>>> + * child entry and try again.
>>>> + */
>>>> + if (!amdgpu_vm_pt_descendant(adev, &cursor))
>>>> + return -ENOENT;
>>>> + continue;
>>>> + } else if (frag >= parent_shift) {
>>>> + /* If the fragment size is even larger than the parent
>>>> + * shift we should go up one level and check it again.
>>>> + */
>>>> + if (!amdgpu_vm_pt_ancestor(&cursor))
>>>> + return -ENOENT;
>>>> + continue;
>>>> }
>>>> - r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
>>>> - flags | frag_flags);
>>>> - if (r)
>>>> - return r;
>>>> + /* Looks good so far, calculate parameters for the update */
>>>> + incr = AMDGPU_GPU_PAGE_SIZE << shift;
>>>> + num_entries = amdgpu_vm_num_entries(adev, cursor.level);
>>>> + pe_start = ((cursor.pfn >> shift) & (num_entries - 1)) * 8;
>>>> + entry_end = num_entries << shift;
>>>> + entry_end += cursor.pfn & ~(entry_end - 1);
>>>> + entry_end = min(entry_end, end);
>>>> +
>>>> + do {
>>>> + uint64_t upd_end = min(entry_end, frag_end);
>>>> + unsigned nptes = (upd_end - frag_start) >> shift;
>>>> +
>>>> + amdgpu_vm_update_huge(params, pt, cursor.level,
>>>> + pe_start, dst, nptes, incr,
>>>> + flags | AMDGPU_PTE_FRAG(frag));
>>>> +
>>>> + pe_start += nptes * 8;
>>>> + dst += nptes * AMDGPU_GPU_PAGE_SIZE << shift;
>>>> +
>>>> + frag_start = upd_end;
>>>> + if (frag_start >= frag_end) {
>>>> + /* figure out the next fragment */
>>>> + amdgpu_vm_fragment(params, frag_start, end,
>>>> + flags, &frag, &frag_end);
>>>> + if (frag < shift)
>>>> + break;
>>>> + }
>>>> + } while (frag_start < entry_end);
>>>> - dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
>>>> - start = frag_end;
>>>> + if (frag >= shift)
>>>> + amdgpu_vm_pt_next(adev, &cursor);
>>>> }
>>>> return 0;
>>>> @@ -1708,8 +1735,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>>>> params.func = amdgpu_vm_cpu_set_ptes;
>>>> params.pages_addr = pages_addr;
>>>> - return amdgpu_vm_frag_ptes(&params, start, last + 1,
>>>> - addr, flags);
>>>> + return amdgpu_vm_update_ptes(&params, start, last + 1,
>>>> + addr, flags);
>>>> }
>>>> ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched);
>>>> @@ -1788,7 +1815,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>>>> if (r)
>>>> goto error_free;
>>>> - r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
>>>> + r = amdgpu_vm_update_ptes(&params, start, last + 1, addr, flags);
>>>> if (r)
>>>> goto error_free;
>>>>
>>