[PATCH 6/8] drm/amdgpu: meld together VM fragment and huge page handling

Thu Nov 8 16:50:31 UTC 2018

Hi Samuel,

yeah, that is a known issue and I was already looking into that all day 
today. Problem is that I can't reproduce it reliable.

Do you have any hint how to trigger that? E.g. memory exhaustion maybe?

Thanks,
Christian.

Am 08.11.18 um 17:17 schrieb Samuel Pitoiset:
> Hi,
>
> This change introduces GPU hangs with F1 2017 and Rise of Tomb Raider 
> on RADV/GFX9. I bisected the issue.
>
> Can you have a look?
>
> Thanks!
>
> On 9/12/18 10:54 AM, Christian König wrote:
>> This optimizes the generating of PTEs by walking the hierarchy only once
>> for a range and making changes as necessary.
>>
>> It allows for both huge (2MB) as well giant (1GB) pages to be used on
>> Vega and Raven.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>> Acked-by: Junwei Zhang <Jerry.Zhang at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 267 
>> ++++++++++++++++++---------------
>>   1 file changed, 147 insertions(+), 120 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 747b6ff6fea7..328325324a1d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1488,46 +1488,76 @@ int amdgpu_vm_update_directories(struct 
>> amdgpu_device *adev,
>>   }
>>     /**
>> - * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages
>> + * amdgpu_vm_update_huge - figure out parameters for PTE updates
>>    *
>> - * @p: see amdgpu_pte_update_params definition
>> - * @entry: vm_pt entry to check
>> - * @parent: parent entry
>> - * @nptes: number of PTEs updated with this operation
>> - * @dst: destination address where the PTEs should point to
>> - * @flags: access flags fro the PTEs
>> - *
>> - * Check if we can update the PD with a huge page.
>> + * Make sure to set the right flags for the PTEs at the desired level.
>>    */
>> -static void amdgpu_vm_handle_huge_pages(struct 
>> amdgpu_pte_update_params *p,
>> -                    struct amdgpu_vm_pt *entry,
>> -                    struct amdgpu_vm_pt *parent,
>> -                    unsigned nptes, uint64_t dst,
>> -                    uint64_t flags)
>> -{
>> -    uint64_t pde;
>> +static void amdgpu_vm_update_huge(struct amdgpu_pte_update_params 
>> *params,
>> +                  struct amdgpu_bo *bo, unsigned level,
>> +                  uint64_t pe, uint64_t addr,
>> +                  unsigned count, uint32_t incr,
>> +                  uint64_t flags)
>>   -    /* In the case of a mixed PT the PDE must point to it*/
>> -    if (p->adev->asic_type >= CHIP_VEGA10 && !p->src &&
>> -        nptes == AMDGPU_VM_PTE_COUNT(p->adev)) {
>> -        /* Set the huge page flag to stop scanning at this PDE */
>> +{
>> +    if (level != AMDGPU_VM_PTB) {
>>           flags |= AMDGPU_PDE_PTE;
>> +        amdgpu_gmc_get_vm_pde(params->adev, level, &addr, &flags);
>>       }
>>   -    if (!(flags & AMDGPU_PDE_PTE)) {
>> -        if (entry->huge) {
>> -            /* Add the entry to the relocated list to update it. */
>> -            entry->huge = false;
>> -            amdgpu_vm_bo_relocated(&entry->base);
>> -        }
>> +    amdgpu_vm_update_func(params, bo, pe, addr, count, incr, flags);
>> +}
>> +
>> +/**
>> + * amdgpu_vm_fragment - get fragment for PTEs
>> + *
>> + * @params: see amdgpu_pte_update_params definition
>> + * @start: first PTE to handle
>> + * @end: last PTE to handle
>> + * @flags: hw mapping flags
>> + * @frag: resulting fragment size
>> + * @frag_end: end of this fragment
>> + *
>> + * Returns the first possible fragment for the start and end address.
>> + */
>> +static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params,
>> +                   uint64_t start, uint64_t end, uint64_t flags,
>> +                   unsigned int *frag, uint64_t *frag_end)
>> +{
>> +    /**
>> +     * The MC L1 TLB supports variable sized pages, based on a fragment
>> +     * field in the PTE. When this field is set to a non-zero value, 
>> page
>> +     * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
>> +     * flags are considered valid for all PTEs within the fragment 
>> range
>> +     * and corresponding mappings are assumed to be physically 
>> contiguous.
>> +     *
>> +     * The L1 TLB can store a single PTE for the whole fragment,
>> +     * significantly increasing the space available for translation
>> +     * caching. This leads to large improvements in throughput when the
>> +     * TLB is under pressure.
>> +     *
>> +     * The L2 TLB distributes small and large fragments into two
>> +     * asymmetric partitions. The large fragment cache is significantly
>> +     * larger. Thus, we try to use large fragments wherever possible.
>> +     * Userspace can support this by aligning virtual base address and
>> +     * allocation size to the fragment size.
>> +     */
>> +    unsigned max_frag = params->adev->vm_manager.fragment_size;
>> +
>> +    /* system pages are non continuously */
>> +    if (params->src || !(flags & AMDGPU_PTE_VALID)) {
>> +        *frag = 0;
>> +        *frag_end = end;
>>           return;
>>       }
>>   -    entry->huge = true;
>> -    amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags);
>> -
>> -    pde = (entry - parent->entries) * 8;
>> -    amdgpu_vm_update_func(p, parent->base.bo, pde, dst, 1, 0, flags);
>> +    /* This intentionally wraps around if no bit is set */
>> +    *frag = min((unsigned)ffs(start) - 1, (unsigned)fls64(end - 
>> start) - 1);
>> +    if (*frag >= max_frag) {
>> +        *frag = max_frag;
>> +        *frag_end = end & ~((1ULL << max_frag) - 1);
>> +    } else {
>> +        *frag_end = start + (1 << *frag);
>> +    }
>>   }
>>     /**
>> @@ -1545,108 +1575,105 @@ static void 
>> amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
>>    * 0 for success, -EINVAL for failure.
>>    */
>>   static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params 
>> *params,
>> -                  uint64_t start, uint64_t end,
>> -                  uint64_t dst, uint64_t flags)
>> +                 uint64_t start, uint64_t end,
>> +                 uint64_t dst, uint64_t flags)
>>   {
>>       struct amdgpu_device *adev = params->adev;
>> -    const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
>>       struct amdgpu_vm_pt_cursor cursor;
>> +    uint64_t frag_start = start, frag_end;
>> +    unsigned int frag;
>>   -    /* walk over the address space and update the page tables */
>> -    for_each_amdgpu_vm_pt_leaf(adev, params->vm, start, end - 1, 
>> cursor) {
>> +    /* figure out the initial fragment */
>> +    amdgpu_vm_fragment(params, frag_start, end, flags, &frag, 
>> &frag_end);
>> +
>> +    /* walk over the address space and update the PTs */
>> +    amdgpu_vm_pt_start(adev, params->vm, start, &cursor);
>> +    while (cursor.pfn < end) {
>>           struct amdgpu_bo *pt = cursor.entry->base.bo;
>> -        uint64_t pe_start;
>> -        unsigned nptes;
>> +        unsigned shift, parent_shift, num_entries;
>> +        uint64_t incr, entry_end, pe_start;
>>   -        if (!pt || cursor.level != AMDGPU_VM_PTB)
>> +        if (!pt)
>>               return -ENOENT;
>>   -        if ((cursor.pfn & ~mask) == (end & ~mask))
>> -            nptes = end - cursor.pfn;
>> -        else
>> -            nptes = AMDGPU_VM_PTE_COUNT(adev) - (cursor.pfn & mask);
>> -
>> -        amdgpu_vm_handle_huge_pages(params, cursor.entry, 
>> cursor.parent,
>> -                        nptes, dst, flags);
>> -        /* We don't need to update PTEs for huge pages */
>> -        if (cursor.entry->huge) {
>> -            dst += nptes * AMDGPU_GPU_PAGE_SIZE;
>> +        /* The root level can't be a huge page */
>> +        if (cursor.level == adev->vm_manager.root_level) {
>> +            if (!amdgpu_vm_pt_descendant(adev, &cursor))
>> +                return -ENOENT;
>>               continue;
>>           }
>>   -        pe_start = (cursor.pfn & mask) * 8;
>> -        amdgpu_vm_update_func(params, pt, pe_start, dst, nptes,
>> -                      AMDGPU_GPU_PAGE_SIZE, flags);
>> -        dst += nptes * AMDGPU_GPU_PAGE_SIZE;
>> -    }
>> -
>> -    return 0;
>> -}
>> +        /* First check if the entry is already handled */
>> +        if (cursor.pfn < frag_start) {
>> +            cursor.entry->huge = true;
>> +            amdgpu_vm_pt_next(adev, &cursor);
>> +            continue;
>> +        }
>>   -/*
>> - * amdgpu_vm_frag_ptes - add fragment information to PTEs
>> - *
>> - * @params: see amdgpu_pte_update_params definition
>> - * @vm: requested vm
>> - * @start: first PTE to handle
>> - * @end: last PTE to handle
>> - * @dst: addr those PTEs should point to
>> - * @flags: hw mapping flags
>> - *
>> - * Returns:
>> - * 0 for success, -EINVAL for failure.
>> - */
>> -static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params    
>> *params,
>> -                uint64_t start, uint64_t end,
>> -                uint64_t dst, uint64_t flags)
>> -{
>> -    /**
>> -     * The MC L1 TLB supports variable sized pages, based on a fragment
>> -     * field in the PTE. When this field is set to a non-zero value, 
>> page
>> -     * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
>> -     * flags are considered valid for all PTEs within the fragment 
>> range
>> -     * and corresponding mappings are assumed to be physically 
>> contiguous.
>> -     *
>> -     * The L1 TLB can store a single PTE for the whole fragment,
>> -     * significantly increasing the space available for translation
>> -     * caching. This leads to large improvements in throughput when the
>> -     * TLB is under pressure.
>> -     *
>> -     * The L2 TLB distributes small and large fragments into two
>> -     * asymmetric partitions. The large fragment cache is significantly
>> -     * larger. Thus, we try to use large fragments wherever possible.
>> -     * Userspace can support this by aligning virtual base address and
>> -     * allocation size to the fragment size.
>> -     */
>> -    unsigned max_frag = params->adev->vm_manager.fragment_size;
>> -    int r;
>> +        /* If it isn't already handled it can't be a huge page */
>> +        if (cursor.entry->huge) {
>> +            /* Add the entry to the relocated list to update it. */
>> +            cursor.entry->huge = false;
>> +            amdgpu_vm_bo_relocated(&cursor.entry->base);
>> +        }
>>   -    /* system pages are non continuously */
>> -    if (params->src || !(flags & AMDGPU_PTE_VALID))
>> -        return amdgpu_vm_update_ptes(params, start, end, dst, flags);
>> -
>> -    while (start != end) {
>> -        uint64_t frag_flags, frag_end;
>> -        unsigned frag;
>> -
>> -        /* This intentionally wraps around if no bit is set */
>> -        frag = min((unsigned)ffs(start) - 1,
>> -               (unsigned)fls64(end - start) - 1);
>> -        if (frag >= max_frag) {
>> -            frag_flags = AMDGPU_PTE_FRAG(max_frag);
>> -            frag_end = end & ~((1ULL << max_frag) - 1);
>> -        } else {
>> -            frag_flags = AMDGPU_PTE_FRAG(frag);
>> -            frag_end = start + (1 << frag);
>> +        shift = amdgpu_vm_level_shift(adev, cursor.level);
>> +        parent_shift = amdgpu_vm_level_shift(adev, cursor.level - 1);
>> +        if (adev->asic_type < CHIP_VEGA10) {
>> +            /* No huge page support before GMC v9 */
>> +            if (cursor.level != AMDGPU_VM_PTB) {
>> +                if (!amdgpu_vm_pt_descendant(adev, &cursor))
>> +                    return -ENOENT;
>> +                continue;
>> +            }
>> +        } else if (frag < shift) {
>> +            /* We can't use this level when the fragment size is
>> +             * smaller than the address shift. Go to the next
>> +             * child entry and try again.
>> +             */
>> +            if (!amdgpu_vm_pt_descendant(adev, &cursor))
>> +                return -ENOENT;
>> +            continue;
>> +        } else if (frag >= parent_shift) {
>> +            /* If the fragment size is even larger than the parent
>> +             * shift we should go up one level and check it again.
>> +             */
>> +            if (!amdgpu_vm_pt_ancestor(&cursor))
>> +                return -ENOENT;
>> +            continue;
>>           }
>>   -        r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
>> -                      flags | frag_flags);
>> -        if (r)
>> -            return r;
>> +        /* Looks good so far, calculate parameters for the update */
>> +        incr = AMDGPU_GPU_PAGE_SIZE << shift;
>> +        num_entries = amdgpu_vm_num_entries(adev, cursor.level);
>> +        pe_start = ((cursor.pfn >> shift) & (num_entries - 1)) * 8;
>> +        entry_end = num_entries << shift;
>> +        entry_end += cursor.pfn & ~(entry_end - 1);
>> +        entry_end = min(entry_end, end);
>> +
>> +        do {
>> +            uint64_t upd_end = min(entry_end, frag_end);
>> +            unsigned nptes = (upd_end - frag_start) >> shift;
>> +
>> +            amdgpu_vm_update_huge(params, pt, cursor.level,
>> +                          pe_start, dst, nptes, incr,
>> +                          flags | AMDGPU_PTE_FRAG(frag));
>> +
>> +            pe_start += nptes * 8;
>> +            dst += nptes * AMDGPU_GPU_PAGE_SIZE << shift;
>> +
>> +            frag_start = upd_end;
>> +            if (frag_start >= frag_end) {
>> +                /* figure out the next fragment */
>> +                amdgpu_vm_fragment(params, frag_start, end,
>> +                           flags, &frag, &frag_end);
>> +                if (frag < shift)
>> +                    break;
>> +            }
>> +        } while (frag_start < entry_end);
>>   -        dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
>> -        start = frag_end;
>> +        if (frag >= shift)
>> +            amdgpu_vm_pt_next(adev, &cursor);
>>       }
>>         return 0;
>> @@ -1708,8 +1735,8 @@ static int amdgpu_vm_bo_update_mapping(struct 
>> amdgpu_device *adev,
>>             params.func = amdgpu_vm_cpu_set_ptes;
>>           params.pages_addr = pages_addr;
>> -        return amdgpu_vm_frag_ptes(&params, start, last + 1,
>> -                       addr, flags);
>> +        return amdgpu_vm_update_ptes(&params, start, last + 1,
>> +                         addr, flags);
>>       }
>>         ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, 
>> sched);
>> @@ -1788,7 +1815,7 @@ static int amdgpu_vm_bo_update_mapping(struct 
>> amdgpu_device *adev,
>>       if (r)
>>           goto error_free;
>>   -    r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
>> +    r = amdgpu_vm_update_ptes(&params, start, last + 1, addr, flags);
>>       if (r)
>>           goto error_free;
>>