[PATCH v2] drm/amdgpu: Unlocked unmap only clear page table leaves
Christian König
christian.koenig at amd.com
Fri Feb 7 10:17:10 UTC 2025
Am 30.01.25 um 17:19 schrieb Philip Yang:
> On 2025-01-29 11:40, Christian König wrote:
>> Am 23.01.25 um 21:39 schrieb Philip Yang:
>>> SVM migration unmap pages from GPU and then update mapping to GPU to
>>> recover page fault. Currently unmap clears the PDE entry for range
>>> length >= huge page and may free PTB bo. Then update mapping should
>>> alloc new PT bo, but there is race bug that the freed entry bo maybe
>>> still on the pt_free list, reused when updating mapping and then freed,
>>> leave invalid PDE entry and cause GPU page fault.
>>>
>>> By setting the update fragment size to 2MB or 1GB, update will clear
>>> only one PDE entry or clear PTB, to avoid unmap to free PTE bo. This
>>> fixes the race bug and also improve the unmap and map to GPU
>>> performance. Update mapping to huge page will still free the PTB bo.
>>>
>>> With this change, the vm->pt_freed list and work is not needed. Add
>>> WARN_ON(unlocked) in amdgpu_vm_pt_add_list to catch if unmap to free
>>> the
>>> PTB.
>>>
>>> v2: Limit update fragment size, not hack entry_end (Christian)
>>>
>>> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 --
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 --
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 54
>>> +++++++++--------------
>>> 3 files changed, 21 insertions(+), 41 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index c9c48b782ec1..48b2c0b3b315 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -2440,8 +2440,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev,
>>> struct amdgpu_vm *vm,
>>> spin_lock_init(&vm->status_lock);
>>> INIT_LIST_HEAD(&vm->freed);
>>> INIT_LIST_HEAD(&vm->done);
>>> - INIT_LIST_HEAD(&vm->pt_freed);
>>> - INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
>>> INIT_KFIFO(vm->faults);
>>> r = amdgpu_vm_init_entities(adev, vm);
>>> @@ -2613,8 +2611,6 @@ void amdgpu_vm_fini(struct amdgpu_device
>>> *adev, struct amdgpu_vm *vm)
>>> amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
>>> - flush_work(&vm->pt_free_work);
>>> -
>>> root = amdgpu_bo_ref(vm->root.bo);
>>> amdgpu_bo_reserve(root, true);
>>> amdgpu_vm_put_task_info(vm->task_info);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> index 5d119ac26c4f..160889e5e64d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> @@ -369,10 +369,6 @@ struct amdgpu_vm {
>>> /* BOs which are invalidated, has been updated in the PTs */
>>> struct list_head done;
>>> - /* PT BOs scheduled to free and fill with zero if vm_resv is
>>> not hold */
>>> - struct list_head pt_freed;
>>> - struct work_struct pt_free_work;
>>> -
>>> /* contains the page directory */
>>> struct amdgpu_vm_bo_base root;
>>> struct dma_fence *last_update;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>> index f78a0434a48f..063d0e0a9f29 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>> @@ -546,27 +546,6 @@ static void amdgpu_vm_pt_free(struct
>>> amdgpu_vm_bo_base *entry)
>>> amdgpu_bo_unref(&entry->bo);
>>> }
>>> -void amdgpu_vm_pt_free_work(struct work_struct *work)
>>> -{
>>> - struct amdgpu_vm_bo_base *entry, *next;
>>> - struct amdgpu_vm *vm;
>>> - LIST_HEAD(pt_freed);
>>> -
>>> - vm = container_of(work, struct amdgpu_vm, pt_free_work);
>>> -
>>> - spin_lock(&vm->status_lock);
>>> - list_splice_init(&vm->pt_freed, &pt_freed);
>>> - spin_unlock(&vm->status_lock);
>>> -
>>> - /* flush_work in amdgpu_vm_fini ensure vm->root.bo is valid. */
>>> - amdgpu_bo_reserve(vm->root.bo, true);
>>> -
>>> - list_for_each_entry_safe(entry, next, &pt_freed, vm_status)
>>> - amdgpu_vm_pt_free(entry);
>>> -
>>> - amdgpu_bo_unreserve(vm->root.bo);
>>> -}
>>> -
>>> /**
>>> * amdgpu_vm_pt_free_list - free PD/PT levels
>>> *
>>> @@ -579,20 +558,9 @@ void amdgpu_vm_pt_free_list(struct
>>> amdgpu_device *adev,
>>> struct amdgpu_vm_update_params *params)
>>> {
>>> struct amdgpu_vm_bo_base *entry, *next;
>>> - struct amdgpu_vm *vm = params->vm;
>>> - bool unlocked = params->unlocked;
>>> if (list_empty(¶ms->tlb_flush_waitlist))
>>> return;
>>> -
>>> - if (unlocked) {
>>> - spin_lock(&vm->status_lock);
>>> - list_splice_init(¶ms->tlb_flush_waitlist, &vm->pt_freed);
>>> - spin_unlock(&vm->status_lock);
>>> - schedule_work(&vm->pt_free_work);
>>> - return;
>>> - }
>>> -
>>> list_for_each_entry_safe(entry, next,
>>> ¶ms->tlb_flush_waitlist, vm_status)
>>> amdgpu_vm_pt_free(entry);
>>> }
>>> @@ -611,6 +579,11 @@ static void amdgpu_vm_pt_add_list(struct
>>> amdgpu_vm_update_params *params,
>>> struct amdgpu_vm_pt_cursor seek;
>>> struct amdgpu_vm_bo_base *entry;
>>> + /*
>>> + * unlocked unmap only clear page table leaves, warning to free
>>> the page table entry.
>>> + */
>>> + WARN_ON(params->unlocked);
>>> +
>>> spin_lock(¶ms->vm->status_lock);
>>> for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm,
>>> cursor, seek, entry) {
>>> if (entry && entry->bo)
>>> @@ -794,6 +767,21 @@ static void amdgpu_vm_pte_fragment(struct
>>> amdgpu_vm_update_params *params,
>>> /* This intentionally wraps around if no bit is set */
>>> *frag = min_t(unsigned int, ffs(start) - 1, fls64(end - start)
>>> - 1);
>>> +
>>> + /*
>>> + * MMU notifier callback unlocked unmap only clear PDE or PTE
>>> leaves by setting fragment
>>> + * size to 2MB, 1GB, 512GB. If leave is PDE entry, only clear
>>> one entry, next fragment entry
>>> + * will search again for PDE or PTE leaves.
>>> + */
>>> + if (params->unlocked) {
>>> + if (*frag > 9 && *frag < 18)
>>> + *frag = 9;
>>> + else if (*frag > 18 && *frag < 27)
>>> + *frag = 18;
>>> + else if (*frag > 27)
>>> + *frag = 27;
>>> + }
>>> +
>>
>> That here looks unnecessary, the higher level handling is able to
>> pick the right PD/PT level based on the fragment anyway.
>
> Thanks for the review, it is very hard to image this case, update PDE0
> entries, then free PTB bos, as this works fine for locked mapping.
>
Yeah, I also need up to an hour to swap that logic back into my head
every time I need to take a look at it. Understanding the VM stuff in
both HW and SW is really not easy.
> For unlocked unmapping, after updating multiple PDE0 entries, it is
> incorrect to free PTB bo if there is non-huge page mapping,
>
>
> For example, below is debug log to unmap d4000 to d4806, address
> d4000, d4200 huge page mapping, d4400 not huge page mapping.
>
> If using fragment 11, it trigger the bug. No issue if we limit the
> fragment to 9
>
Ah! Ok, I see what you try to solve here.
In that case we should probably indeed use a separate function. Since
using the level to determine where to update stuff is then fundamentally
wrong.
In other words even if your round down the fragment size to a multiple
of 9 you can still run into issues when the range which is unmapped is
larger than 1GiB.
E.g. you then have frag=18, but would eventually need frag=9 to not
start freeing the 2MiB page tables.
Regards,
Christian.
>
> [ 192.084456] amdgpu: 4 cursor pfn 0x7f87d4000 level 3 shift 0
> frag_start 0x7f87d4000 frag_end 0x7f87d4800 frag 11
>
> move cursor up to walk parent node, as this is huge page mapping, no
> PTB bo
>
> [ 192.084463] amdgpu: 5 cursor pfn 0x7f87d4000 level 2 shift 9
> frag_start 0x7f87d4000 frag_end 0x7f87d4800 frag 11
>
> clean mapping on PDE0, d4000, d4200, d4400, d4600
>
> [ 192.084480] amdgpu: 7 cursor pfn 0x7f87d4000 level 2 shift 9
> frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
>
> Then if (amdgpu_vm_pt_descendant(adev, &cursor)) is true,
>
> [ 192.084487] amdgpu: 8 cursor pfn 0x7f87d4000 level 3 shift 9
> frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
>
> This is fine, no PTB bo
>
> [ 192.084494] amdgpu: 9 cursor pfn 0x7f87d4200 level 3 shift 9
> frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
>
> This is fine, no PTB bo
>
> [ 192.084501] amdgpu: 9 cursor pfn 0x7f87d4400 level 3 shift 9
> frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
>
> PTB bo found, trigger the WARNING in this patch, to free PTB bo.
>
>
> [ 192.084525] ------------[ cut here ]------------
> [ 192.084531] WARNING: CPU: 9 PID: 3249 at
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c:585
> amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
> [ 192.084854] Modules linked in: nf_tables nfnetlink ast
> drm_shmem_helper k10temp msr fuse ip_tables x_tables amdgpu amdxcp
> drm_client_lib drm_ttm_helper ttm drm_exec gpu_sched
> drm_suballoc_helper video wmi cec drm_buddy drm_display_helper
> drm_kms_helper drm drm_panel_orientation_quirks i2c_piix4
> [ 192.084987] CPU: 9 UID: 1000 PID: 3249 Comm: kfdtest Tainted:
> G W 6.12.0-kfd-yangp #146
> [ 192.084997] Tainted: [W]=WARN
> [ 192.085004] Hardware name: GIGABYTE MZ01-CE0-00/MZ01-CE0-00, BIOS
> F12 08/05/2019
> [ 192.085011] RIP: 0010:amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
> [ 192.085316] Code: 24 4c 8b 54 24 78 4d 89 f8 48 c7 c7 b0 83 0a c1
> 4c 8b 4c 24 50 50 8b 4c 24 10 e8 8f bc b9 d2 48 8b 74 24 68 5a e9 70
> fe ff ff <0f> 0b e9 92 fe ff ff 48 bd 00 00 00 00 01 00 00 00 ba ff ff
> ff ff
> [ 192.085325] RSP: 0018:ffffbed102bf78b8 EFLAGS: 00010202
> [ 192.085336] RAX: ffff9f04b654a1f8 RBX: 0000000000000009 RCX:
> 0000000000000000
> [ 192.085344] RDX: 0000000000000002 RSI: 00000007f87d4400 RDI:
> ffff9f0af6f9cac8
> [ 192.085351] RBP: 00000007f87d4806 R08: 0000000000000000 R09:
> c0000000ff7fffff
> [ 192.085357] R10: 0000000000000001 R11: ffffbed102bf76e0 R12:
> ffff9f046b100000
> [ 192.085364] R13: 0000000000bf4000 R14: ffffbed102bf79e0 R15:
> 00000007f87d4800
> [ 192.085371] FS: 00007f87d5a515c0(0000) GS:ffff9f0af6f80000(0000)
> knlGS:0000000000000000
> [ 192.085379] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 192.085386] CR2: 000055a190964b78 CR3: 0000000163fca000 CR4:
> 00000000003506f0
> [ 192.085393] Call Trace:
> [ 192.085400] <TASK>
> [ 192.085408] ? __warn+0x90/0x190
> [ 192.085422] ? amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
> [ 192.085832] ? report_bug+0xfc/0x1e0
> [ 192.085849] ? handle_bug+0x55/0x90
> [ 192.085860] ? exc_invalid_op+0x17/0x70
> [ 192.085871] ? asm_exc_invalid_op+0x1a/0x20
> [ 192.085892] ? amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
> [ 192.086199] ? srso_return_thunk+0x5/0x5f
> [ 192.086216] amdgpu_vm_update_range+0x242/0x850 [amdgpu]
> [ 192.086537] svm_range_unmap_from_gpus+0x117/0x300 [amdgpu]
> [ 192.086906] svm_range_cpu_invalidate_pagetables+0x426/0x7a0 [amdgpu]
> [ 192.087259] ? lock_release+0xc8/0x290
> [ 192.087276] __mmu_notifier_invalidate_range_start+0x233/0x2a0
> [ 192.087292] migrate_vma_setup+0x1a3/0x250
> [ 192.087307] svm_migrate_ram_to_vram+0x260/0x710 [amdgpu]
>
> Regards,
>
> Philip
>
>>
>> Apart from that the patch looks good to me.
>>
>> Regards,
>> Christian.
>>
>>> if (*frag >= max_frag) {
>>> *frag = max_frag;
>>> *frag_end = end & ~((1ULL << max_frag) - 1);
>>> @@ -931,7 +919,7 @@ int amdgpu_vm_ptes_update(struct
>>> amdgpu_vm_update_params *params,
>>> /* figure out the next fragment */
>>> amdgpu_vm_pte_fragment(params, frag_start, end,
>>> flags, &frag, &frag_end);
>>> - if (frag < shift)
>>> + if (frag < shift || (params->unlocked && shift))
>>> break;
>>> }
>>> } while (frag_start < entry_end);
>>
More information about the amd-gfx
mailing list