<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <div class="moz-cite-prefix">On 2025-02-11 05:34, Christian König
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:1d6c89dc-f032-4796-a038-c9e897e7bf1c@gmail.com">
      <br>
      <br>
      On 20.01.25 at 16:59, Philip Yang wrote:
      <br>
      <blockquote type="cite">
        <br>
        <br>
        On 2025-01-15 06:01, Christian König wrote:
        <br>
        <blockquote type="cite">On 14.01.25 at 15:53, Philip Yang wrote:
          <br>
          <blockquote type="cite">SVM migration unmaps pages from the GPU and then updates the mapping to the GPU to recover the page fault. Currently unmap clears the PDE entry for ranges of length >= huge page and frees the PTB bo, and the mapping update then allocates a new PT bo.
            <br>
            <br>
            There is a race where the freed entry bo may still be on the pt_free list, get reused by the mapping update and then freed, leaving an invalid PDE entry and causing a GPU page fault.
            <br>
            <br>
            Set the update to clear only one PDE entry or to clear the PTB, so that unmap does not free the PTE bo. This fixes the race and improves unmap and map-to-GPU performance. Updating the mapping to a huge page will still free the PTB bo.
            <br>
            <br>
            With this change, the vm->pt_freed list and work are no longer needed. Add WARN_ON(unlocked) in amdgpu_vm_pt_free_list to catch unmap freeing the PTB.
            <br>
            <br>
            Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
            <br>
            ---
            <br>
              drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  4 ---
            <br>
              drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  4 ---
            <br>
              drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 43 +++++++----------------
            <br>
              3 files changed, 13 insertions(+), 38 deletions(-)
            <br>
            <br>
            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
            <br>
            index c9c48b782ec1..48b2c0b3b315 100644
            <br>
            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
            <br>
            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
            <br>
            @@ -2440,8 +2440,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
            <br>
                  spin_lock_init(&vm->status_lock);
            <br>
                  INIT_LIST_HEAD(&vm->freed);
            <br>
                  INIT_LIST_HEAD(&vm->done);
            <br>
            -    INIT_LIST_HEAD(&vm->pt_freed);
            <br>
            -    INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
            <br>
                  INIT_KFIFO(vm->faults);
            <br>
                    r = amdgpu_vm_init_entities(adev, vm);
            <br>
            @@ -2613,8 +2611,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
            <br>
                    amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
            <br>
              -    flush_work(&vm->pt_free_work);
            <br>
            -
            <br>
                  root = amdgpu_bo_ref(vm->root.bo);
            <br>
                  amdgpu_bo_reserve(root, true);
            <br>
                  amdgpu_vm_put_task_info(vm->task_info);
            <br>
            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
            <br>
            index 5d119ac26c4f..160889e5e64d 100644
            <br>
            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
            <br>
            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
            <br>
            @@ -369,10 +369,6 @@ struct amdgpu_vm {
            <br>
                  /* BOs which are invalidated, has been updated in the PTs */
            <br>
                  struct list_head        done;
            <br>
              -    /* PT BOs scheduled to free and fill with zero if vm_resv is not hold */
            <br>
            -    struct list_head    pt_freed;
            <br>
            -    struct work_struct    pt_free_work;
            <br>
            -
            <br>
                  /* contains the page directory */
            <br>
                  struct amdgpu_vm_bo_base     root;
            <br>
                  struct dma_fence    *last_update;
            <br>
            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
            <br>
            index f78a0434a48f..54ae0e9bc6d7 100644
            <br>
            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
            <br>
            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
            <br>
            @@ -546,27 +546,6 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
            <br>
                  amdgpu_bo_unref(&entry->bo);
            <br>
              }
            <br>
              -void amdgpu_vm_pt_free_work(struct work_struct *work)
            <br>
            -{
            <br>
            -    struct amdgpu_vm_bo_base *entry, *next;
            <br>
            -    struct amdgpu_vm *vm;
            <br>
            -    LIST_HEAD(pt_freed);
            <br>
            -
            <br>
            -    vm = container_of(work, struct amdgpu_vm, pt_free_work);
            <br>
            -
            <br>
            -    spin_lock(&vm->status_lock);
            <br>
            -    list_splice_init(&vm->pt_freed, &pt_freed);
            <br>
            -    spin_unlock(&vm->status_lock);
            <br>
            -
            <br>
            -    /* flush_work in amdgpu_vm_fini ensure vm->root.bo is valid. */
            <br>
            -    amdgpu_bo_reserve(vm->root.bo, true);
            <br>
            -
            <br>
            -    list_for_each_entry_safe(entry, next, &pt_freed, vm_status)
            <br>
            -        amdgpu_vm_pt_free(entry);
            <br>
            -
            <br>
            -    amdgpu_bo_unreserve(vm->root.bo);
            <br>
            -}
            <br>
            -
            <br>
              /**
            <br>
               * amdgpu_vm_pt_free_list - free PD/PT levels
            <br>
               *
            <br>
            @@ -579,19 +558,15 @@ void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
            <br>
                              struct amdgpu_vm_update_params *params)
            <br>
              {
            <br>
                  struct amdgpu_vm_bo_base *entry, *next;
            <br>
            -    struct amdgpu_vm *vm = params->vm;
            <br>
                  bool unlocked = params->unlocked;
            <br>
                    if (list_empty(&params->tlb_flush_waitlist))
            <br>
                      return;
            <br>
              -    if (unlocked) {
            <br>
            -        spin_lock(&vm->status_lock);
            <br>
            -        list_splice_init(&params->tlb_flush_waitlist, &vm->pt_freed);
            <br>
            -        spin_unlock(&vm->status_lock);
            <br>
            -        schedule_work(&vm->pt_free_work);
            <br>
            -        return;
            <br>
            -    }
            <br>
            +    /*
            <br>
            +     * Unlocked unmap only clears page table leaves; warn if it tries to free the page entry.
            <br>
            +     */
            <br>
            +    WARN_ON(unlocked);
            <br>
                  list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist, vm_status)
            <br>
                      amdgpu_vm_pt_free(entry);
            <br>
            @@ -899,7 +874,15 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
            <br>
                    incr = (uint64_t)AMDGPU_GPU_PAGE_SIZE << shift;
            <br>
                    mask = amdgpu_vm_pt_entries_mask(adev, cursor.level);
            <br>
                    pe_start = ((cursor.pfn >> shift) & mask) * 8;
            <br>
            -        entry_end = ((uint64_t)mask + 1) << shift;
            <br>
            +
            <br>
            +        if (cursor.level < AMDGPU_VM_PTB && params->unlocked)
            <br>
            +            /*
            <br>
            +             * MMU notifier callback does an unlocked unmap of a huge page; the leaf is a PDE entry,
            <br>
            +             * so only clear one entry. The next entry search walks again for the PDE or PTE leaf.
            <br>
            +             */
            <br>
            +            entry_end = 1ULL << shift;
            <br>
            +        else
            <br>
            +            entry_end = ((uint64_t)mask + 1) << shift;
            <br>
          </blockquote>
          <br>
          That looks too much like a hack for my taste. entry_end
          basically denotes the end of the page tables, not the
          updated region.
          <br>
        </blockquote>
        yes, agree.
        <br>
      </blockquote>
      <br>
      After going back and forth over the different solutions, we found
      that we do need this hack for now.
      <br>
      <br>
      There is basically no other solution than to update one entry at a
      time without introducing a new function to do this.
      <br>
      <br>
      So feel free to add Reviewed-by: Christian König
      <a class="moz-txt-link-rfc2396E" href="mailto:christian.koenig@amd.com"><christian.koenig@amd.com></a> to this patch here, but please
      look into cleaning that up as soon as possible.
      <br>
    </blockquote>
    <p>Thanks for the review. I will merge this to solve the issue
      for now, and implement a new unlocked unmap function from scratch,
      as you suggested, to clean this up and improve performance.</p>
    <p>Regards,</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:1d6c89dc-f032-4796-a038-c9e897e7bf1c@gmail.com">
      <br>
      Thanks,
      <br>
      Christian.
      <br>
      <br>
      <blockquote type="cite">
        <blockquote type="cite">
          <br>
          We already set frag_end to this here:
          <br>
          <br>
          frag_end = max(frag_end, ALIGN(frag_start + 1, 1ULL << shift));
          <br>
        </blockquote>
        <br>
        This seems to be the root cause: for example, when unmapping,
        frag_start is an 8MB-aligned address and frag_end is
        frag_start + 8MB, but for an unlocked update we want to walk
        the page tables again after unmapping 2MB.
        <br>
        <br>
        Also, the max(...) seems meaningless, as frag_end is always
        >= frag_start + 1, so this can be changed to:
        <br>
        <br>
        if (params->unlocked)
        <br>
                frag_end = frag_start + 1;
        <br>
        <br>
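        To make that concrete, a rough sketch of how it could sit next to the existing line quoted above (illustrative only, not the merged patch):
        <br>
        <pre>
/*
 * Sketch of the initial fragment setup in amdgpu_vm_ptes_update():
 * for unlocked updates restrict the fragment to a single entry so the
 * walker re-descends after every update, otherwise keep the existing
 * behaviour.
 */
if (params->unlocked)
        frag_end = frag_start + 1;
else
        frag_end = max(frag_end, ALIGN(frag_start + 1, 1ULL << shift));
        </pre>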
        <br>
        <blockquote type="cite">
          <br>
          Which basically means that we should update just one entry at
          a time and then walk the page tables again.
          <br>
          <br>
          The issue is just that we immediately calculate a new fragment
          after each update:
          <br>
          <br>
                                  if (frag_start >= frag_end) {
          <br>
                                        /* figure out the next fragment */
          <br>
                                        amdgpu_vm_pte_fragment(params, frag_start, end,
          <br>
                                                               flags, &frag, &frag_end);
          <br>
                                          if (frag < shift)
          <br>
                                                  break;
          <br>
                                  }
          <br>
          And that looks like the place where we need to adjust something
          to allow updates of the leaf nodes.
          <br>
        </blockquote>
        <br>
        Yes, and we should always break here to restart the page walk
        for unlocked unmapping:
        <br>
        <br>
        if (frag < shift || params->unlocked)
        <br>
                break;
        <br>
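        Putting that together with the hunk quoted above, roughly (only a sketch, not tested):
        <br>
        <pre>
if (frag_start >= frag_end) {
        /* figure out the next fragment */
        amdgpu_vm_pte_fragment(params, frag_start, end,
                               flags, &frag, &frag_end);
        /* always restart the page walk for unlocked unmaps */
        if (frag < shift || params->unlocked)
                break;
}
        </pre>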
        <br>
        <blockquote type="cite">
          <br>
          Alternatively, I wouldn't mind having a completely separate
          function for unlocked invalidations.
          <br>
        </blockquote>
        <br>
        A completely separate function would duplicate lots of page walker
        code. Alternatively, we could change amdgpu_vm_pte_fragment so
        that, for unlocked unmapping, it always returns a frag_end of 2MB
        (or 1GB), not 4MB, 8MB, ....
        <br>
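        A rough sketch of that alternative, assuming the fragment is tracked as a log2 page count inside amdgpu_vm_pte_fragment (the max_frag name here is only for illustration):
        <br>
        <pre>
/*
 * Sketch only: cap the fragment for unlocked unmaps at one PDE entry,
 * i.e. 2^9 pages of 4KB = 2MB, so frag_end never grows to 4MB, 8MB, ...
 */
if (params->unlocked && max_frag > 9)
        max_frag = 9;
        </pre>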
        <br>
        Regards,
        <br>
        <br>
        Philip
        <br>
        <br>
        <blockquote type="cite">
          <br>
          Regards,
          <br>
          Christian.
          <br>
          <br>
          <br>
          <blockquote type="cite">          entry_end += cursor.pfn & ~(entry_end - 1);
            <br>
                      entry_end = min(entry_end, end);
            <br>
          </blockquote>
          <br>
        </blockquote>
      </blockquote>
      <br>
    </blockquote>
  </body>
</html>