<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2025-02-07 05:17, Christian König
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:328ee3fb-ff15-4a79-aedb-376c68df79a0@amd.com">Am
      30.01.25 um 17:19 schrieb Philip Yang:
      <br>
      <blockquote type="cite">On 2025-01-29 11:40, Christian König
        wrote:
        <br>
        <blockquote type="cite">Am 23.01.25 um 21:39 schrieb Philip
          Yang:
          <br>
          <blockquote type="cite">SVM migration unmap pages from GPU and
            then update mapping to GPU to
            <br>
            recover page fault. Currently unmap clears the PDE entry for
            range
            <br>
            length >= huge page and may free PTB bo. Then update
            mapping should
            <br>
            alloc new PT bo, but there is race bug that the freed entry
            bo maybe
            <br>
            still on the pt_free list, reused when updating mapping and
            then freed,
            <br>
            leave invalid PDE entry and cause GPU page fault.
            <br>
            <br>
            By setting the update fragment size to 2MB or 1GB, update
            will clear
            <br>
            only one PDE entry or clear PTB, to avoid unmap to free PTE
            bo. This
            <br>
            fixes the race bug and also improve the unmap and map to GPU
            <br>
            performance. Update mapping to huge page will still free the
            PTB bo.
            <br>
            <br>
            With this change, the vm->pt_freed list and work is not
            needed. Add
            <br>
            WARN_ON(unlocked) in amdgpu_vm_pt_add_list to catch if unmap
            to free the
            <br>
            PTB.
            <br>
            <br>
            v2: Limit update fragment size, not hack entry_end
            (Christian)
            <br>
            <br>
            Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
            <br>
            ---
            <br>
              drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  4 --
            <br>
              drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  4 --
            <br>
              drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 54
            +++++++++--------------
            <br>
              3 files changed, 21 insertions(+), 41 deletions(-)
            <br>
            <br>
            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
            b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
            <br>
            index c9c48b782ec1..48b2c0b3b315 100644
            <br>
            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
            <br>
            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
            <br>
            @@ -2440,8 +2440,6 @@ int amdgpu_vm_init(struct
            amdgpu_device *adev, struct amdgpu_vm *vm,
            <br>
                  spin_lock_init(&vm->status_lock);
            <br>
                  INIT_LIST_HEAD(&vm->freed);
            <br>
                  INIT_LIST_HEAD(&vm->done);
            <br>
            -    INIT_LIST_HEAD(&vm->pt_freed);
            <br>
            -    INIT_WORK(&vm->pt_free_work,
            amdgpu_vm_pt_free_work);
            <br>
                  INIT_KFIFO(vm->faults);
            <br>
                    r = amdgpu_vm_init_entities(adev, vm);
            <br>
            @@ -2613,8 +2611,6 @@ void amdgpu_vm_fini(struct
            amdgpu_device *adev, struct amdgpu_vm *vm)
            <br>
                    amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
            <br>
              -    flush_work(&vm->pt_free_work);
            <br>
            -
            <br>
                  root = amdgpu_bo_ref(vm->root.bo);
            <br>
                  amdgpu_bo_reserve(root, true);
            <br>
                  amdgpu_vm_put_task_info(vm->task_info);
            <br>
            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
            b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
            <br>
            index 5d119ac26c4f..160889e5e64d 100644
            <br>
            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
            <br>
            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
            <br>
            @@ -369,10 +369,6 @@ struct amdgpu_vm {
            <br>
                  /* BOs which are invalidated, has been updated in the
            PTs */
            <br>
                  struct list_head        done;
            <br>
              -    /* PT BOs scheduled to free and fill with zero if
            vm_resv is not hold */
            <br>
            -    struct list_head    pt_freed;
            <br>
            -    struct work_struct    pt_free_work;
            <br>
            -
            <br>
                  /* contains the page directory */
            <br>
                  struct amdgpu_vm_bo_base     root;
            <br>
                  struct dma_fence    *last_update;
            <br>
            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
            b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
            <br>
            index f78a0434a48f..063d0e0a9f29 100644
            <br>
            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
            <br>
            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
            <br>
            @@ -546,27 +546,6 @@ static void amdgpu_vm_pt_free(struct
            amdgpu_vm_bo_base *entry)
            <br>
                  amdgpu_bo_unref(&entry->bo);
            <br>
              }
            <br>
              -void amdgpu_vm_pt_free_work(struct work_struct *work)
            <br>
            -{
            <br>
            -    struct amdgpu_vm_bo_base *entry, *next;
            <br>
            -    struct amdgpu_vm *vm;
            <br>
            -    LIST_HEAD(pt_freed);
            <br>
            -
            <br>
            -    vm = container_of(work, struct amdgpu_vm,
            pt_free_work);
            <br>
            -
            <br>
            -    spin_lock(&vm->status_lock);
            <br>
            -    list_splice_init(&vm->pt_freed, &pt_freed);
            <br>
            -    spin_unlock(&vm->status_lock);
            <br>
            -
            <br>
            -    /* flush_work in amdgpu_vm_fini ensure vm->root.bo
            is valid. */
            <br>
            -    amdgpu_bo_reserve(vm->root.bo, true);
            <br>
            -
            <br>
            -    list_for_each_entry_safe(entry, next, &pt_freed,
            vm_status)
            <br>
            -        amdgpu_vm_pt_free(entry);
            <br>
            -
            <br>
            -    amdgpu_bo_unreserve(vm->root.bo);
            <br>
            -}
            <br>
            -
            <br>
              /**
            <br>
               * amdgpu_vm_pt_free_list - free PD/PT levels
            <br>
               *
            <br>
            @@ -579,20 +558,9 @@ void amdgpu_vm_pt_free_list(struct
            amdgpu_device *adev,
            <br>
                              struct amdgpu_vm_update_params *params)
            <br>
              {
            <br>
                  struct amdgpu_vm_bo_base *entry, *next;
            <br>
            -    struct amdgpu_vm *vm = params->vm;
            <br>
            -    bool unlocked = params->unlocked;
            <br>
                    if (list_empty(&params->tlb_flush_waitlist))
            <br>
                      return;
            <br>
            -
            <br>
            -    if (unlocked) {
            <br>
            -        spin_lock(&vm->status_lock);
            <br>
            -       
            list_splice_init(&params->tlb_flush_waitlist,
            &vm->pt_freed);
            <br>
            -        spin_unlock(&vm->status_lock);
            <br>
            -        schedule_work(&vm->pt_free_work);
            <br>
            -        return;
            <br>
            -    }
            <br>
            -
            <br>
                  list_for_each_entry_safe(entry, next,
            &params->tlb_flush_waitlist, vm_status)
            <br>
                      amdgpu_vm_pt_free(entry);
            <br>
              }
            <br>
            @@ -611,6 +579,11 @@ static void
            amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params
            *params,
            <br>
                  struct amdgpu_vm_pt_cursor seek;
            <br>
                  struct amdgpu_vm_bo_base *entry;
            <br>
              +    /*
            <br>
            +     * unlocked unmap only clear page table leaves, warning
            to free the page table entry.
            <br>
            +     */
            <br>
            +    WARN_ON(params->unlocked);
            <br>
            +
            <br>
                  spin_lock(&params->vm->status_lock);
            <br>
                  for_each_amdgpu_vm_pt_dfs_safe(params->adev,
            params->vm, cursor, seek, entry) {
            <br>
                      if (entry && entry->bo)
            <br>
            @@ -794,6 +767,21 @@ static void
            amdgpu_vm_pte_fragment(struct amdgpu_vm_update_params
            *params,
            <br>
                    /* This intentionally wraps around if no bit is set
            */
            <br>
                  *frag = min_t(unsigned int, ffs(start) - 1, fls64(end
            - start) - 1);
            <br>
            +
            <br>
            +    /*
            <br>
            +     * MMU notifier callback unlocked unmap only clear PDE
            or PTE leaves by setting fragment
            <br>
            +     * size to 2MB, 1GB, 512GB. If leave is PDE entry, only
            clear one entry, next fragment entry
            <br>
            +     * will search again for PDE or PTE leaves.
            <br>
            +     */
            <br>
            +    if (params->unlocked) {
            <br>
            +        if (*frag > 9 && *frag < 18)
            <br>
            +            *frag = 9;
            <br>
            +        else if (*frag > 18 && *frag < 27)
            <br>
            +            *frag = 18;
            <br>
            +        else if (*frag > 27)
            <br>
            +            *frag = 27;
            <br>
            +    }
            <br>
            +
            <br>
          </blockquote>
          <br>
          That here looks unnecessary, the higher level handling is able
          to pick the right PD/PT level based on the fragment anyway.
          <br>
        </blockquote>
        <br>
        Thanks for the review, it is very hard to imagine this case:
        update PDE0 entries, then free PTB bos, as this works fine for
        locked mapping.
        <br>
        <br>
      </blockquote>
      <br>
      Yeah, I also need up to an hour to swap that logic back into my
      head every time I need to take a look at it. Understanding the VM
      stuff in both HW and SW is really not easy.
      <br>
      <br>
      <blockquote type="cite">For unlocked unmapping,  after updating
        multiple PDE0 entries, it is incorrect to free PTB bo if there
        is non-huge page mapping,
        <br>
        <br>
        <br>
        For example, below is debug log to unmap d4000 to d4806, address
        d4000, d4200 huge page mapping, d4400 not huge page mapping.
        <br>
        <br>
        If using fragment 11, it trigger the bug. No issue if we limit
        the fragment to 9
        <br>
        <br>
      </blockquote>
      <br>
      Ah! Ok, I see what you try to solve here.
      <br>
      <br>
      In that case we should probably indeed use a separate function.
      Since using the level to determine where to update stuff is then
      fundamentally wrong.
      <br>
    </blockquote>
    <p>Yes, the fragment size is not needed for unmap; we could optimize
      the unmap using a separate function for both the unlocked and
      locked unmapping cases. For example, for a 7-page unmap with the
      address aligned to 4 pages, currently we unmap 4 pages, 2 pages,
      then 1 page; instead we could unmap all 7 pages together. But this
      cannot solve the race condition issue: with unlocked unmapping we
      still need to update only the leaves.<br>
    </p>
    <p>I think using the same function for map and unmap is easier to
      maintain and understand, as unmap is just a map update with zero
      PTEs.</p>
    <blockquote type="cite" cite="mid:328ee3fb-ff15-4a79-aedb-376c68df79a0@amd.com">
      <br>
      In other words even if your round down the fragment size to a
      multiple of 9 you can still run into issues when the range which
      is unmapped is larger than 1GiB.
      <br>
      <br>
      E.g. you then have frag=18, but would eventually need frag=9 to
      not start freeing the 2MiB page tables.
      <br>
    </blockquote>
    <p>Rescanning the page table after handling each fragment can get
      the correct frag with the correct PDE leaf or PTB leaf. For
      example, unmap a 4GB range, address aligned to 1GB, with the first
      1GB physically contiguously mapped on PDE1, then 4KB pages on PTE:
      first we get frag=18 (not frag=20 for 4GB), then the unmap steps
      are:</p>
    <p>a. level 1, shift 18, frag 18, update one entry on PDE1</p>
    <p>b. rescan and amdgpu_vm_pt_descendant will stop with level 3,
      shift 0, PTE (frag is still 18, but not used)<br>
    </p>
    <p>c. entry_end is 2MB, clear 512 entries on PTE.</p>
    <p>d. next frag is updated to 9; go to step b to rescan, finding a
      PDE0 entry for a huge page 2MB mapping, or a PTE for a 4KB
      mapping.</p>
    <p>Regards,</p>
    <p>Philip<br>
    </p>
    <br>
    <blockquote type="cite" cite="mid:328ee3fb-ff15-4a79-aedb-376c68df79a0@amd.com">
      <br>
      Regards,
      <br>
      Christian.
      <br>
      <br>
      <br>
      <blockquote type="cite">
        <br>
        [  192.084456] amdgpu: 4 cursor pfn 0x7f87d4000 level 3 shift 0
        frag_start 0x7f87d4000 frag_end 0x7f87d4800 frag 11
        <br>
        <br>
        move cursor up to walk parent node, as this is huge page
        mapping, no PTB bo
        <br>
        <br>
        [  192.084463] amdgpu: 5 cursor pfn 0x7f87d4000 level 2 shift 9
        frag_start 0x7f87d4000 frag_end 0x7f87d4800 frag 11
        <br>
        <br>
        clean mapping on PDE0, d4000, d4200, d4400, d4600
        <br>
        <br>
        [  192.084480] amdgpu: 7 cursor pfn 0x7f87d4000 level 2 shift 9
        frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
        <br>
        <br>
        Then if (amdgpu_vm_pt_descendant(adev, &cursor)) is true,
        <br>
        <br>
        [  192.084487] amdgpu: 8 cursor pfn 0x7f87d4000 level 3 shift 9
        frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
        <br>
        <br>
        This is fine, no PTB bo
        <br>
        <br>
        [  192.084494] amdgpu: 9 cursor pfn 0x7f87d4200 level 3 shift 9
        frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
        <br>
        <br>
        This is fine, no PTB bo
        <br>
        <br>
        [  192.084501] amdgpu: 9 cursor pfn 0x7f87d4400 level 3 shift 9
        frag_start 0x7f87d4800 frag_end 0x7f87d4804 frag 2
        <br>
        <br>
        PTB bo found, trigger the WARNING in this patch, to free PTB bo.
        <br>
        <br>
        <br>
        [  192.084525] ------------[ cut here ]------------
        <br>
        [  192.084531] WARNING: CPU: 9 PID: 3249 at
        drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c:585
        amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
        <br>
        [  192.084854] Modules linked in: nf_tables nfnetlink ast
        drm_shmem_helper k10temp msr fuse ip_tables x_tables amdgpu
        amdxcp drm_client_lib drm_ttm_helper ttm drm_exec gpu_sched
        drm_suballoc_helper video wmi cec drm_buddy drm_display_helper
        drm_kms_helper drm drm_panel_orientation_quirks i2c_piix4
        <br>
        [  192.084987] CPU: 9 UID: 1000 PID: 3249 Comm: kfdtest Tainted:
        G        W          6.12.0-kfd-yangp #146
        <br>
        [  192.084997] Tainted: [W]=WARN
        <br>
        [  192.085004] Hardware name: GIGABYTE MZ01-CE0-00/MZ01-CE0-00,
        BIOS F12 08/05/2019
        <br>
        [  192.085011] RIP: 0010:amdgpu_vm_ptes_update+0xa2c/0xbd0
        [amdgpu]
        <br>
        [  192.085316] Code: 24 4c 8b 54 24 78 4d 89 f8 48 c7 c7 b0 83
        0a c1 4c 8b 4c 24 50 50 8b 4c 24 10 e8 8f bc b9 d2 48 8b 74 24
        68 5a e9 70 fe ff ff <0f> 0b e9 92 fe ff ff 48 bd 00 00 00
        00 01 00 00 00 ba ff ff ff ff
        <br>
        [  192.085325] RSP: 0018:ffffbed102bf78b8 EFLAGS: 00010202
        <br>
        [  192.085336] RAX: ffff9f04b654a1f8 RBX: 0000000000000009 RCX:
        0000000000000000
        <br>
        [  192.085344] RDX: 0000000000000002 RSI: 00000007f87d4400 RDI:
        ffff9f0af6f9cac8
        <br>
        [  192.085351] RBP: 00000007f87d4806 R08: 0000000000000000 R09:
        c0000000ff7fffff
        <br>
        [  192.085357] R10: 0000000000000001 R11: ffffbed102bf76e0 R12:
        ffff9f046b100000
        <br>
        [  192.085364] R13: 0000000000bf4000 R14: ffffbed102bf79e0 R15:
        00000007f87d4800
        <br>
        [  192.085371] FS:  00007f87d5a515c0(0000)
        GS:ffff9f0af6f80000(0000) knlGS:0000000000000000
        <br>
        [  192.085379] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
        <br>
        [  192.085386] CR2: 000055a190964b78 CR3: 0000000163fca000 CR4:
        00000000003506f0
        <br>
        [  192.085393] Call Trace:
        <br>
        [  192.085400]  <TASK>
        <br>
        [  192.085408]  ? __warn+0x90/0x190
        <br>
        [  192.085422]  ? amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
        <br>
        [  192.085832]  ? report_bug+0xfc/0x1e0
        <br>
        [  192.085849]  ? handle_bug+0x55/0x90
        <br>
        [  192.085860]  ? exc_invalid_op+0x17/0x70
        <br>
        [  192.085871]  ? asm_exc_invalid_op+0x1a/0x20
        <br>
        [  192.085892]  ? amdgpu_vm_ptes_update+0xa2c/0xbd0 [amdgpu]
        <br>
        [  192.086199]  ? srso_return_thunk+0x5/0x5f
        <br>
        [  192.086216]  amdgpu_vm_update_range+0x242/0x850 [amdgpu]
        <br>
        [  192.086537]  svm_range_unmap_from_gpus+0x117/0x300 [amdgpu]
        <br>
        [  192.086906]  svm_range_cpu_invalidate_pagetables+0x426/0x7a0
        [amdgpu]
        <br>
        [  192.087259]  ? lock_release+0xc8/0x290
        <br>
        [  192.087276] __mmu_notifier_invalidate_range_start+0x233/0x2a0
        <br>
        [  192.087292]  migrate_vma_setup+0x1a3/0x250
        <br>
        [  192.087307]  svm_migrate_ram_to_vram+0x260/0x710 [amdgpu]
        <br>
        <br>
        Regards,
        <br>
        <br>
        Philip
        <br>
        <br>
        <blockquote type="cite">
          <br>
          Apart from that the patch looks good to me.
          <br>
          <br>
          Regards,
          <br>
          Christian.
          <br>
          <br>
          <blockquote type="cite">      if (*frag >= max_frag) {
            <br>
                      *frag = max_frag;
            <br>
                      *frag_end = end & ~((1ULL << max_frag) -
            1);
            <br>
            @@ -931,7 +919,7 @@ int amdgpu_vm_ptes_update(struct
            amdgpu_vm_update_params *params,
            <br>
                              /* figure out the next fragment */
            <br>
                              amdgpu_vm_pte_fragment(params, frag_start,
            end,
            <br>
                                             flags, &frag,
            &frag_end);
            <br>
            -                if (frag < shift)
            <br>
            +                if (frag < shift || (params->unlocked
            && shift))
            <br>
                                  break;
            <br>
                          }
            <br>
                      } while (frag_start < entry_end);
            <br>
          </blockquote>
          <br>
        </blockquote>
      </blockquote>
      <br>
    </blockquote>
  </body>
</html>