[PATCH v2] drm/amdgpu: Unlocked unmap only clear page table leaves
Christian König
christian.koenig at amd.com
Fri Feb 7 14:58:34 UTC 2025
Am 07.02.25 um 15:53 schrieb Philip Yang:
> On 2025-02-07 05:17, Christian König wrote:
>> Am 30.01.25 um 17:19 schrieb Philip Yang:
>>> On 2025-01-29 11:40, Christian König wrote:
>>>> Am 23.01.25 um 21:39 schrieb Philip Yang:
>>>>> SVM migration unmap pages from GPU and then update mapping to GPU to
>>>>> recover page fault. Currently unmap clears the PDE entry for range
>>>>> length >= huge page and may free PTB bo. Then update mapping should
>>>>> alloc new PT bo, but there is race bug that the freed entry bo maybe
>>>>> still on the pt_free list, reused when updating mapping and then
>>>>> freed,
>>>>> leave invalid PDE entry and cause GPU page fault.
>>>>>
>>>>> By setting the update fragment size to 2MB or 1GB, update will clear
>>>>> only one PDE entry or clear the PTB, to avoid the unmap freeing the PTE
>>>>> bo. This fixes the race bug and also improves the unmap and map to GPU
>>>>> performance. Update mapping to a huge page will still free the PTB bo.
>>>>>
>>>>> With this change, the vm->pt_freed list and work are not needed. Add
>>>>> WARN_ON(unlocked) in amdgpu_vm_pt_add_list to catch if unmap tries to
>>>>> free the PTB.
>>>>>
>>>>> v2: Limit update fragment size, not hack entry_end (Christian)
>>>>>
>>>>> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
>>>>> ---
>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 --
>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 --
>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 54
>>>>> +++++++++--------------
>>>>> 3 files changed, 21 insertions(+), 41 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> index c9c48b782ec1..48b2c0b3b315 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> @@ -2440,8 +2440,6 @@ int amdgpu_vm_init(struct amdgpu_device
>>>>> *adev, struct amdgpu_vm *vm,
>>>>> spin_lock_init(&vm->status_lock);
>>>>> INIT_LIST_HEAD(&vm->freed);
>>>>> INIT_LIST_HEAD(&vm->done);
>>>>> - INIT_LIST_HEAD(&vm->pt_freed);
>>>>> - INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
>>>>> INIT_KFIFO(vm->faults);
>>>>> r = amdgpu_vm_init_entities(adev, vm);
>>>>> @@ -2613,8 +2611,6 @@ void amdgpu_vm_fini(struct amdgpu_device
>>>>> *adev, struct amdgpu_vm *vm)
>>>>> amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
>>>>> - flush_work(&vm->pt_free_work);
>>>>> -
>>>>> root = amdgpu_bo_ref(vm->root.bo);
>>>>> amdgpu_bo_reserve(root, true);
>>>>> amdgpu_vm_put_task_info(vm->task_info);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>> index 5d119ac26c4f..160889e5e64d 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>>> @@ -369,10 +369,6 @@ struct amdgpu_vm {
>>>>> /* BOs which are invalidated, has been updated in the PTs */
>>>>> struct list_head done;
>>>>> - /* PT BOs scheduled to free and fill with zero if vm_resv
>>>>> is not hold */
>>>>> - struct list_head pt_freed;
>>>>> - struct work_struct pt_free_work;
>>>>> -
>>>>> /* contains the page directory */
>>>>> struct amdgpu_vm_bo_base root;
>>>>> struct dma_fence *last_update;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>>>> index f78a0434a48f..063d0e0a9f29 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>>>> @@ -546,27 +546,6 @@ static void amdgpu_vm_pt_free(struct
>>>>> amdgpu_vm_bo_base *entry)
>>>>> amdgpu_bo_unref(&entry->bo);
>>>>> }
>>>>> -void amdgpu_vm_pt_free_work(struct work_struct *work)
>>>>> -{
>>>>> - struct amdgpu_vm_bo_base *entry, *next;
>>>>> - struct amdgpu_vm *vm;
>>>>> - LIST_HEAD(pt_freed);
>>>>> -
>>>>> - vm = container_of(work, struct amdgpu_vm, pt_free_work);
>>>>> -
>>>>> - spin_lock(&vm->status_lock);
>>>>> - list_splice_init(&vm->pt_freed, &pt_freed);
>>>>> - spin_unlock(&vm->status_lock);
>>>>> -
>>>>> - /* flush_work in amdgpu_vm_fini ensure vm->root.bo is valid. */
>>>>> - amdgpu_bo_reserve(vm->root.bo, true);
>>>>> -
>>>>> - list_for_each_entry_safe(entry, next, &pt_freed, vm_status)
>>>>> - amdgpu_vm_pt_free(entry);
>>>>> -
>>>>> - amdgpu_bo_unreserve(vm->root.bo);
>>>>> -}
>>>>> -
>>>>> /**
>>>>> * amdgpu_vm_pt_free_list - free PD/PT levels
>>>>> *
>>>>> @@ -579,20 +558,9 @@ void amdgpu_vm_pt_free_list(struct
>>>>> amdgpu_device *adev,
>>>>> struct amdgpu_vm_update_params *params)
>>>>> {
>>>>> struct amdgpu_vm_bo_base *entry, *next;
>>>>> - struct amdgpu_vm *vm = params->vm;
>>>>> - bool unlocked = params->unlocked;
>>>>> if (list_empty(¶ms->tlb_flush_waitlist))
>>>>> return;
>>>>> -
>>>>> - if (unlocked) {
>>>>> - spin_lock(&vm->status_lock);
>>>>> - list_splice_init(¶ms->tlb_flush_waitlist, &vm->pt_freed);
>>>>> - spin_unlock(&vm->status_lock);
>>>>> - schedule_work(&vm->pt_free_work);
>>>>> - return;
>>>>> - }
>>>>> -
>>>>> list_for_each_entry_safe(entry, next,
>>>>> ¶ms->tlb_flush_waitlist, vm_status)
>>>>> amdgpu_vm_pt_free(entry);
>>>>> }
>>>>> @@ -611,6 +579,11 @@ static void amdgpu_vm_pt_add_list(struct
>>>>> amdgpu_vm_update_params *params,
>>>>> struct amdgpu_vm_pt_cursor seek;
>>>>> struct amdgpu_vm_bo_base *entry;
>>>>> + /*
>>>>> + * unlocked unmap only clear page table leaves, warning to
>>>>> free the page table entry.
>>>>> + */
>>>>> + WARN_ON(params->unlocked);
>>>>> +
>>>>> spin_lock(¶ms->vm->status_lock);
>>>>> for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm,
>>>>> cursor, seek, entry) {
>>>>> if (entry && entry->bo)
>>>>> @@ -794,6 +767,21 @@ static void amdgpu_vm_pte_fragment(struct
>>>>> amdgpu_vm_update_params *params,
>>>>> /* This intentionally wraps around if no bit is set */
>>>>> *frag = min_t(unsigned int, ffs(start) - 1, fls64(end -
>>>>> start) - 1);
>>>>> +
>>>>> + /*
>>>>> + * MMU notifier callback unlocked unmap only clear PDE or PTE
>>>>> leaves by setting fragment
>>>>> + * size to 2MB, 1GB, 512GB. If leave is PDE entry, only clear
>>>>> one entry, next fragment entry
>>>>> + * will search again for PDE or PTE leaves.
>>>>> + */
>>>>> + if (params->unlocked) {
>>>>> + if (*frag > 9 && *frag < 18)
>>>>> + *frag = 9;
>>>>> + else if (*frag > 18 && *frag < 27)
>>>>> + *frag = 18;
>>>>> + else if (*frag > 27)
>>>>> + *frag = 27;
>>>>> + }
>>>>> +
>>>>
>>>> That here looks unnecessary, the higher level handling is able to
>>>> pick the right PD/PT level based on the fragment anyway.
>>>
>>> Thanks for the review, it is very hard to imagine this case: update
>>> PDE0 entries, then free PTB bos, as this works fine for locked mapping.
>>>
>>
>> Yeah, I also need up to an hour to swap that logic back into my head
>> every time I need to take a look at it. Understanding the VM stuff in
>> both HW and SW is really not easy.
>>
>>> For unlocked unmapping, after updating multiple PDE0 entries, it is
>>> incorrect to free PTB bo if there is non-huge page mapping,
>>>
>>>
>>> For example, below is debug log to unmap d4000 to d4806, address
>>> d4000, d4200 huge page mapping, d4400 not huge page mapping.
>>>
>>> If using fragment 11, it trigger the bug. No issue if we limit the
>>> fragment to 9
>>>
>>
>> Ah! Ok, I see what you try to solve here.
>>
>> In that case we should probably indeed use a separate function. Since
>> using the level to determine where to update stuff is then
>> fundamentally wrong.
>
> yes, fragment size is not needed for unmap, we could optimize the
> unmap using a separate function for both unlocked unmapping and locked
> unmapping case, for example 7 pages unmap with address aligned to 4
> pages, currently we unmap 4 pages, 2 pages, then 1 page, instead we
> could unmap 7 pages together. But this cannot solve this race
> condition issue; with unlocked unmapping we still need to only update leaves.
>
> I think same function for map and unmap is easy to maintain and
> understand as unmap is just map update with zero PTE.
>
>>
>> In other words even if your round down the fragment size to a
>> multiple of 9 you can still run into issues when the range which is
>> unmapped is larger than 1GiB.
>>
>> E.g. you then have frag=18, but would eventually need frag=9 to not
>> start freeing the 2MiB page tables.
>
> rescanning the page table after each fragment handling can get the correct
> frag with the correct PDE leaf or PTB leaf, for example, unmap 4GB range,
> address aligned to 1GB, with first 1GB physical contiguous mapping on
> PDE1, then 4KB pages on PTE, first we get frag=18 (not frag=20 for
> 4GB), then the unmap steps:
>
> a. level 1, shift 18, frag 18, update one entry on PDE1
>
> b. rescan and amdgpu_vm_pt_descendant will stop with level 3, shift 0,
> PTE (frag is still 18, but not used)
>
> c. entry_end is 2MB, clean 512 entries on PTE,
>
Yeah and when one of those 512 entries points to a 4k region you end up
in a messed up state again, because you try to free this 4k region after
the 512 entry update.
This approach here won't work either. You really need a separate
function which doesn't use the frag size to determine what level of page
tables to update.
Regards,
Christian.
> d. next frag is updated to 9, then go back to step b to rescan, finding a
> PDE0 entry for a huge page 2MB mapping, or a PTE for a 4KB mapping
>
> Regards,
>
> Philip
>
>
>>
>> Regards,
>> Christian.
>>
>>
>>>
>>> [ 192.084456] amdgpu: 4 cursor pfn 0x7f87d4000 level 3 shift 0
>>> frag_start 0x7f87d4000 frag_end 0x7f87d4800 frag 11
>>>
>>> move cursor up to walk parent node, as this is huge page mapping, no
>>> PTB bo
>>>
>>> [ 192.084463] amdgpu: 5 cursor pfn 0x7f87d4000 level 2 shift 9
>>> frag_start 0x7f87d4000 frag_end 0x7f87d4800 frag 11
>>>
>>> clean mapping on PDE0, d4000, d4200, d4400, d4600
>>>
>>> [ 192.084480] amdgpu: 7 cursor pfn 0x7f87d4000 level 2 shift 9
>>> frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
>>>
>>> Then if (amdgpu_vm_pt_descendant(adev, &cursor)) is true,
>>>
>>> [ 192.084487] amdgpu: 8 cursor pfn 0x7f87d4000 level 3 shift 9
>>> frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
>>>
>>> This is fine, no PTB bo
>>>
>>> [ 192.084494] amdgpu: 9 cursor pfn 0x7f87d4200 level 3 shift 9
>>> frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
>>>
>>> This is fine, no PTB bo
>>>
>>> [ 192.084501] amdgpu: 9 cursor pfn 0x7f87d4400 level 3 shift 9
>>> frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
>>>
>>> PTB bo found, trigger the WARNING in this patch, to free PTB bo.
>>>
>>>
>>> [ 192.084525] ------------[ cut here ]------------
>>> [ 192.084531] WARNING: CPU: 9 PID: 3249 at
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c:585
>>> amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
>>> [ 192.084854] Modules linked in: nf_tables nfnetlink ast
>>> drm_shmem_helper k10temp msr fuse ip_tables x_tables amdgpu amdxcp
>>> drm_client_lib drm_ttm_helper ttm drm_exec gpu_sched
>>> drm_suballoc_helper video wmi cec drm_buddy drm_display_helper
>>> drm_kms_helper drm drm_panel_orientation_quirks i2c_piix4
>>> [ 192.084987] CPU: 9 UID: 1000 PID: 3249 Comm: kfdtest Tainted:
>>> G W 6.12.0-kfd-yangp #146
>>> [ 192.084997] Tainted: [W]=WARN
>>> [ 192.085004] Hardware name: GIGABYTE MZ01-CE0-00/MZ01-CE0-00, BIOS
>>> F12 08/05/2019
>>> [ 192.085011] RIP: 0010:amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
>>> [ 192.085316] Code: 24 4c 8b 54 24 78 4d 89 f8 48 c7 c7 b0 83 0a c1
>>> 4c 8b 4c 24 50 50 8b 4c 24 10 e8 8f bc b9 d2 48 8b 74 24 68 5a e9 70
>>> fe ff ff <0f> 0b e9 92 fe ff ff 48 bd 00 00 00 00 01 00 00 00 ba ff
>>> ff ff ff
>>> [ 192.085325] RSP: 0018:ffffbed102bf78b8 EFLAGS: 00010202
>>> [ 192.085336] RAX: ffff9f04b654a1f8 RBX: 0000000000000009 RCX:
>>> 0000000000000000
>>> [ 192.085344] RDX: 0000000000000002 RSI: 00000007f87d4400 RDI:
>>> ffff9f0af6f9cac8
>>> [ 192.085351] RBP: 00000007f87d4806 R08: 0000000000000000 R09:
>>> c0000000ff7fffff
>>> [ 192.085357] R10: 0000000000000001 R11: ffffbed102bf76e0 R12:
>>> ffff9f046b100000
>>> [ 192.085364] R13: 0000000000bf4000 R14: ffffbed102bf79e0 R15:
>>> 00000007f87d4800
>>> [ 192.085371] FS: 00007f87d5a515c0(0000) GS:ffff9f0af6f80000(0000)
>>> knlGS:0000000000000000
>>> [ 192.085379] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> [ 192.085386] CR2: 000055a190964b78 CR3: 0000000163fca000 CR4:
>>> 00000000003506f0
>>> [ 192.085393] Call Trace:
>>> [ 192.085400] <TASK>
>>> [ 192.085408] ? __warn+0x90/0x190
>>> [ 192.085422] ? amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
>>> [ 192.085832] ? report_bug+0xfc/0x1e0
>>> [ 192.085849] ? handle_bug+0x55/0x90
>>> [ 192.085860] ? exc_invalid_op+0x17/0x70
>>> [ 192.085871] ? asm_exc_invalid_op+0x1a/0x20
>>> [ 192.085892] ? amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
>>> [ 192.086199] ? srso_return_thunk+0x5/0x5f
>>> [ 192.086216] amdgpu_vm_update_range+0x242/0x850 [amdgpu]
>>> [ 192.086537] svm_range_unmap_from_gpus+0x117/0x300 [amdgpu]
>>> [ 192.086906] svm_range_cpu_invalidate_pagetables+0x426/0x7a0 [amdgpu]
>>> [ 192.087259] ? lock_release+0xc8/0x290
>>> [ 192.087276] __mmu_notifier_invalidate_range_start+0x233/0x2a0
>>> [ 192.087292] migrate_vma_setup+0x1a3/0x250
>>> [ 192.087307] svm_migrate_ram_to_vram+0x260/0x710 [amdgpu]
>>>
>>> Regards,
>>>
>>> Philip
>>>
>>>>
>>>> Apart from that the patch looks good to me.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> if (*frag >= max_frag) {
>>>>> *frag = max_frag;
>>>>> *frag_end = end & ~((1ULL << max_frag) - 1);
>>>>> @@ -931,7 +919,7 @@ int amdgpu_vm_ptes_update(struct
>>>>> amdgpu_vm_update_params *params,
>>>>> /* figure out the next fragment */
>>>>> amdgpu_vm_pte_fragment(params, frag_start, end,
>>>>> flags, &frag, &frag_end);
>>>>> - if (frag < shift)
>>>>> + if (frag < shift || (params->unlocked && shift))
>>>>> break;
>>>>> }
>>>>> } while (frag_start < entry_end);
>>>>
>>
More information about the amd-gfx
mailing list