<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2021-11-17 7:10 p.m., Felix Kuehling
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:5e1927f0-2cbc-e639-57b1-f61274ec97f4@amd.com">On
      2021-11-16 10:43 p.m., Philip Yang wrote:
      <br>
      <blockquote type="cite">VMA may be removed before unmap notifier
        callback, restore pages take
        <br>
        mmap write lock to lookup VMA to avoid race,
        <br>
      </blockquote>
      <br>
      The old code looked up the VMA after taking the mmap lock (either
      read or write) and kept holding the lock afterwards. I think even
      with your new code it's possible that the VMA disappears before
      you take the lock the first time, so always taking the write lock
      only reduces the time window in which things can go wrong, but it
      doesn't remove the race.
      <br>
    </blockquote>
    Taking the mmap write lock serializes with __do_munmap, which
    guarantees that the VMA removal and the unmap notifier callback have
    both completed. The unmap callback sets the drain_retryfaults flag, so
    we can safely drain the stale faults, and if the VMA is still not found
    after taking the mmap write lock it is an application bug.<br>
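    <p>To spell out the serialization on the fault-handler side (a rough
      sketch only; the exact code is in the patch below):</p>
    <pre>
	/* __do_munmap removes the VMA and runs the MMU-notifier unmap
	 * callback (which sets the drain-retry-faults flag) while holding
	 * the mmap write lock, so we cannot take the write lock until both
	 * steps are done.
	 */
	mmap_write_lock(mm);

	vma = find_vma(p->mm, addr << PAGE_SHIFT);
	if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
		/* VMA already removed: the stale fault will be drained,
		 * otherwise this is an application bug.
		 */
		pr_debug("VMA not found for address 0x%llx\n", addr);
		mmap_write_downgrade(mm);
		r = -EFAULT;
		goto out_unlock_mm;
	}
    </pre>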
    <blockquote type="cite" cite="mid:5e1927f0-2cbc-e639-57b1-f61274ec97f4@amd.com">
      <br>
      I still struggle to understand the race you're trying to fix. The
      only time the svm_restore_pages can see that the VMA is gone AND
      the prange is gone is after the deferred worker has removed the
      prange. But the fault draining in the deferred worker should
      prevent us from ever seeing stale faults in that situation. That
      means, if no prange is found and no VMA is found, it's definitely
      an application bug.
      <br>
      <br>
      The only possible race is in the case where the prange still
      exists but the VMA is gone (needed by svm_fault_allowed). We can
      treat that as a special case where we just return success because
      we know that we're handling a stale fault for a VMA that's in the
      middle of being unmapped. The fact that the prange still existed
      means that there once was a valid mapping at the address but the
      deferred worker just hasn't had a chance to clean it up yet.
      <br>
      <br>
    </blockquote>
    Yes, that is the only possible race.<br>
    <blockquote type="cite" cite="mid:5e1927f0-2cbc-e639-57b1-f61274ec97f4@amd.com">One more
      comment inline.
      <br>
      <br>
      <br>
      <blockquote type="cite">  and then create unregister
        <br>
        new range and check VMA access permission, then downgrade to
        take mmap
        <br>
        read lock to recover fault. Refactor code to avoid duplicate VMA
        lookup.
        <br>
        <br>
        Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
        <br>
        ---
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 65
        ++++++++++------------------
        <br>
          1 file changed, 24 insertions(+), 41 deletions(-)
        <br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        index c1f367934428..3eb0a9491755 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        @@ -2329,20 +2329,13 @@ svm_range_best_restore_location(struct
        svm_range *prange,
        <br>
          }
        <br>
            static int
        <br>
        -svm_range_get_range_boundaries(struct kfd_process *p, int64_t
        addr,
        <br>
        -                   unsigned long *start, unsigned long *last,
        <br>
        -                   bool *is_heap_stack)
        <br>
        +svm_range_get_range_boundaries(struct kfd_process *p, struct
        vm_area_struct *vma,
        <br>
        +                   int64_t addr, unsigned long *start,
        <br>
        +                   unsigned long *last, bool *is_heap_stack)
        <br>
          {
        <br>
        -    struct vm_area_struct *vma;
        <br>
              struct interval_tree_node *node;
        <br>
              unsigned long start_limit, end_limit;
        <br>
          -    vma = find_vma(p->mm, addr << PAGE_SHIFT);
        <br>
        -    if (!vma || (addr << PAGE_SHIFT) <
        vma->vm_start) {
        <br>
        -        pr_debug("VMA does not exist in address [0x%llx]\n",
        addr);
        <br>
        -        return -EFAULT;
        <br>
        -    }
        <br>
        -
        <br>
              *is_heap_stack = (vma->vm_start <=
        vma->vm_mm->brk &&
        <br>
                        vma->vm_end >=
        vma->vm_mm->start_brk) ||
        <br>
                       (vma->vm_start <=
        vma->vm_mm->start_stack &&
        <br>
        @@ -2437,9 +2430,10 @@ svm_range_check_vm_userptr(struct
        kfd_process *p, uint64_t start, uint64_t last,
        <br>
            static struct
        <br>
          svm_range *svm_range_create_unregistered_range(struct
        amdgpu_device *adev,
        <br>
        -                        struct kfd_process *p,
        <br>
        -                        struct mm_struct *mm,
        <br>
        -                        int64_t addr)
        <br>
        +                           struct kfd_process *p,
        <br>
        +                           struct mm_struct *mm,
        <br>
        +                           struct vm_area_struct *vma,
        <br>
        +                           int64_t addr)
        <br>
          {
        <br>
              struct svm_range *prange = NULL;
        <br>
              unsigned long start, last;
        <br>
        @@ -2449,7 +2443,7 @@ svm_range
        *svm_range_create_unregistered_range(struct amdgpu_device *adev,
        <br>
              uint64_t bo_l = 0;
        <br>
              int r;
        <br>
          -    if (svm_range_get_range_boundaries(p, addr, &start,
        &last,
        <br>
        +    if (svm_range_get_range_boundaries(p, vma, addr,
        &start, &last,
        <br>
                                 &is_heap_stack))
        <br>
                  return NULL;
        <br>
          @@ -2552,20 +2546,13 @@ svm_range_count_fault(struct
        amdgpu_device *adev, struct kfd_process *p,
        <br>
          }
        <br>
            static bool
        <br>
        -svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool
        write_fault)
        <br>
        +svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
        <br>
          {
        <br>
              unsigned long requested = VM_READ;
        <br>
        -    struct vm_area_struct *vma;
        <br>
                if (write_fault)
        <br>
                  requested |= VM_WRITE;
        <br>
          -    vma = find_vma(mm, addr << PAGE_SHIFT);
        <br>
        -    if (!vma || (addr << PAGE_SHIFT) <
        vma->vm_start) {
        <br>
        -        pr_debug("address 0x%llx VMA is removed\n", addr);
        <br>
        -        return true;
        <br>
        -    }
        <br>
        -
        <br>
              pr_debug("requested 0x%lx, vma permission flags 0x%lx\n",
        requested,
        <br>
                  vma->vm_flags);
        <br>
              return (vma->vm_flags & requested) == requested;
        <br>
        @@ -2582,7 +2569,7 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
              uint64_t timestamp;
        <br>
              int32_t best_loc;
        <br>
              int32_t gpuidx = MAX_GPU_INSTANCE;
        <br>
        -    bool write_locked = false;
        <br>
        +    struct vm_area_struct *vma = NULL;
        <br>
              int r = 0;
        <br>
                if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
        <br>
        @@ -2606,26 +2593,22 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
                /* mm is available because kfd_process_notifier_release
        drain fault */
        <br>
              mm = p->mm;
        <br>
        +    mmap_write_lock(mm);
        <br>
      </blockquote>
      <br>
      Always taking the write lock is unnecessary. I think we can keep
      the old strategy of retrying with the write lock only when
      necessary. I think this should work correctly as long as you
      lookup the VMA every time after taking either the mmap read or
      write lock. The vma you looked up should be valid as long as you
      hold that lock.
      <br>
      <br>
      As I pointed out above, if svm_range_from_addr finds a prange but
      the VMA is missing, we can treat that as a special case and return
      success (just draining a stale fault on a VMA that's being
      unmapped).
      <br>
    </blockquote>
    <p>OK, I will change svm_fault_allowed to return success if the VMA is
      missing. That is a simpler way to handle this special race case,
      without taking the mmap write lock.</p>
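    <p>i.e. roughly keep the current upstream helper, with the missing-VMA
      case treated as allowed (a sketch, not the final v2):</p>
    <pre>
static bool
svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
{
	unsigned long requested = VM_READ;
	struct vm_area_struct *vma;

	if (write_fault)
		requested |= VM_WRITE;

	vma = find_vma(mm, addr << PAGE_SHIFT);
	if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
		/* Stale fault on a VMA that is being unmapped, allow it */
		pr_debug("address 0x%llx VMA is removed\n", addr);
		return true;
	}

	pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
		 vma->vm_flags);
	return (vma->vm_flags & requested) == requested;
}
    </pre>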
    <p>Regards,</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:5e1927f0-2cbc-e639-57b1-f61274ec97f4@amd.com">
      <br>
      Regards,
      <br>
        Felix
      <br>
      <br>
      <br>
      <blockquote type="cite">+
        <br>
        +    vma = find_vma(p->mm, addr << PAGE_SHIFT);
        <br>
        +    if (!vma || (addr << PAGE_SHIFT) <
        vma->vm_start) {
        <br>
        +        pr_debug("VMA not found for address 0x%llx\n", addr);
        <br>
        +        mmap_write_downgrade(mm);
        <br>
        +        r = -EFAULT;
        <br>
        +        goto out_unlock_mm;
        <br>
        +    }
        <br>
          -    mmap_read_lock(mm);
        <br>
        -retry_write_locked:
        <br>
              mutex_lock(&svms->lock);
        <br>
              prange = svm_range_from_addr(svms, addr, NULL);
        <br>
              if (!prange) {
        <br>
                  pr_debug("failed to find prange svms 0x%p address
        [0x%llx]\n",
        <br>
                       svms, addr);
        <br>
        -        if (!write_locked) {
        <br>
        -            /* Need the write lock to create new range with MMU
        notifier.
        <br>
        -             * Also flush pending deferred work to make sure
        the interval
        <br>
        -             * tree is up to date before we add a new range
        <br>
        -             */
        <br>
        -            mutex_unlock(&svms->lock);
        <br>
        -            mmap_read_unlock(mm);
        <br>
        -            mmap_write_lock(mm);
        <br>
        -            write_locked = true;
        <br>
        -            goto retry_write_locked;
        <br>
        -        }
        <br>
        -        prange = svm_range_create_unregistered_range(adev, p,
        mm, addr);
        <br>
        +        prange = svm_range_create_unregistered_range(adev, p,
        mm, vma, addr);
        <br>
                  if (!prange) {
        <br>
                      pr_debug("failed to create unregistered range svms
        0x%p address [0x%llx]\n",
        <br>
                           svms, addr);
        <br>
        @@ -2634,8 +2617,8 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
                      goto out_unlock_svms;
        <br>
                  }
        <br>
              }
        <br>
        -    if (write_locked)
        <br>
        -        mmap_write_downgrade(mm);
        <br>
        +
        <br>
        +    mmap_write_downgrade(mm);
        <br>
                mutex_lock(&prange->migrate_mutex);
        <br>
          @@ -2652,7 +2635,7 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
                  goto out_unlock_range;
        <br>
              }
        <br>
          -    if (!svm_fault_allowed(mm, addr, write_fault)) {
        <br>
        +    if (!svm_fault_allowed(vma, write_fault)) {
        <br>
                  pr_debug("fault addr 0x%llx no %s permission\n", addr,
        <br>
                      write_fault ? "write" : "read");
        <br>
                  r = -EPERM;
        <br>
        @@ -2704,10 +2687,10 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
              mutex_unlock(&prange->migrate_mutex);
        <br>
          out_unlock_svms:
        <br>
              mutex_unlock(&svms->lock);
        <br>
        +out_unlock_mm:
        <br>
              mmap_read_unlock(mm);
        <br>
                svm_range_count_fault(adev, p, gpuidx);
        <br>
        -
        <br>
          out:
        <br>
              kfd_unref_process(p);
        <br>
          </blockquote>
    </blockquote>
  </body>
</html>