<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <div class="moz-cite-prefix">On 2021-11-17 6:18 p.m., Felix Kuehling
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:fb6b3750-56e4-8a79-c479-10c4cbbd3e4a@amd.com">On
      2021-11-16 10:43 p.m., Philip Yang wrote:
      <br>
      <blockquote type="cite">kfd process mmu release notifier callback
        drain retry fault to ensure no
        <br>
        retry fault comes after removing kfd process from the hash
        table,
        <br>
        otherwise svm page fault handler will fail to recover the fault
        and dump
        <br>
        GPU vm fault log.
        <br>
        <br>
        Drain retry fault needs flush restore page fault work to wait
        for
        <br>
        the last fault is handled because IH dispatch increase rptr
        first and
        <br>
        then calls restore_pages, so restore pages may still handle the
        last
        <br>
        fault but amdgpu_ih_has_checkpoint_processed return true.
        <br>
      </blockquote>
      <br>
      This fixes the problem, but it will result in waiting longer than
      necessary because the worker only finishes when the IH ring is
      empty.
      <br>
      <br>
    </blockquote>
    I am working on a new IH ring1 overflow patch to handle the
    drain_retry_fault race, so the flush will not be needed here.<br>
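    <p>For reference, a minimal sketch of the drain sequence as it stands in
      this patch, using only the calls shown in the diff below; the
      flush_work() is what can make us wait longer than strictly necessary,
      because the ih1 worker only finishes once ring 1 is empty:</p>
    <pre>        /* adev = pdd-&gt;dev-&gt;adev, as in svm_range_drain_retry_fault() */

        /* All faults queued up to this point have been dispatched once the
         * checkpoint is processed ...
         */
        amdgpu_ih_wait_on_checkpoint_process(adev, &amp;adev-&gt;irq.ih1);

        /* ... but IH dispatch advances rptr before calling restore_pages,
         * so also wait for the worker, which only returns when ring 1 is
         * empty.
         */
        flush_work(&amp;adev-&gt;irq.ih1_work);</pre>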
    <blockquote type="cite" cite="mid:fb6b3750-56e4-8a79-c479-10c4cbbd3e4a@amd.com">
      <br>
      <blockquote type="cite">
        <br>
        restore_pages cannot call mmget because mmput may call mmu
        notifier release and cause a deadlock.
        <br>
      </blockquote>
      <br>
      See my comment inline.
      <br>
      <br>
      <br>
      <blockquote type="cite">
        <br>
        Refactor the deferred list work to call mmget and take the mmap
        write lock to
        <br>
        handle all ranges, avoiding the mm going away while inserting the
        mmu notifier.
        <br>
        <br>
        Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
        <br>
        ---
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_process.c |  6 +++
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 69
        ++++++++++++------------
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_svm.h     |  1 +
        <br>
          3 files changed, 41 insertions(+), 35 deletions(-)
        <br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
        b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
        <br>
        index d4c8a6948a9f..8b4b045d5c92 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
        <br>
        @@ -1143,6 +1143,12 @@ static void
        kfd_process_notifier_release(struct mmu_notifier *mn,
        <br>
              if (WARN_ON(p->mm != mm))
        <br>
                  return;
        <br>
          +    /*
        <br>
        +     * Ensure no retry fault comes in afterwards, as page fault
        handler will
        <br>
        +     * not find kfd process and take mm lock to recover fault.
        <br>
        +     */
        <br>
        +    svm_range_drain_retry_fault(&p->svms);
        <br>
        +
        <br>
              mutex_lock(&kfd_processes_mutex);
        <br>
              hash_del_rcu(&p->kfd_processes);
        <br>
              mutex_unlock(&kfd_processes_mutex);
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        index 88360f23eb61..c1f367934428 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        @@ -1953,9 +1953,10 @@ svm_range_handle_list_op(struct
        svm_range_list *svms, struct svm_range *prange)
        <br>
              }
        <br>
          }
        <br>
          -static void svm_range_drain_retry_fault(struct svm_range_list
        *svms)
        <br>
        +void svm_range_drain_retry_fault(struct svm_range_list *svms)
        <br>
          {
        <br>
              struct kfd_process_device *pdd;
        <br>
        +    struct amdgpu_device *adev;
        <br>
              struct kfd_process *p;
        <br>
              uint32_t i;
        <br>
          @@ -1967,9 +1968,11 @@ static void
        svm_range_drain_retry_fault(struct svm_range_list *svms)
        <br>
                      continue;
        <br>
                    pr_debug("drain retry fault gpu %d svms %p\n", i,
        svms);
        <br>
        +        adev = pdd->dev->adev;
        <br>
        +        amdgpu_ih_wait_on_checkpoint_process(adev,
        &adev->irq.ih1);
        <br>
          -       
        amdgpu_ih_wait_on_checkpoint_process(pdd->dev->adev,
        <br>
        -                            
        &pdd->dev->adev->irq.ih1);
        <br>
        +        /* Wait for the last page fault is handled */
        <br>
        +        flush_work(&adev->irq.ih1_work);
        <br>
                  pr_debug("drain retry fault gpu %d svms 0x%p done\n",
        i, svms);
        <br>
              }
        <br>
          }
        <br>
        @@ -1979,43 +1982,43 @@ static void
        svm_range_deferred_list_work(struct work_struct *work)
        <br>
              struct svm_range_list *svms;
        <br>
              struct svm_range *prange;
        <br>
              struct mm_struct *mm;
        <br>
        +    struct kfd_process *p;
        <br>
                svms = container_of(work, struct svm_range_list,
        deferred_list_work);
        <br>
              pr_debug("enter svms 0x%p\n", svms);
        <br>
          +    p = container_of(svms, struct kfd_process, svms);
        <br>
        +    mm = p->mm;
        <br>
        +
        <br>
        +    /* Take mm->mm_users to avoid mm is gone when inserting
        mmu notifier */
        <br>
        +    if (!mm || !mmget_not_zero(mm)) {
        <br>
      </blockquote>
      <br>
      get_task_mm would be safer than relying on p->mm. I regret ever
      adding that to the process structure.
      <br>
      <br>
    </blockquote>
    I will use get_task_mm(pdd-&gt;process-&gt;lead_thread); it is safer
    because we take a task reference.<br>
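    <p>Concretely, the start of svm_range_deferred_list_work() could then
      look roughly like the sketch below (my paraphrase of the suggestion,
      using the names from the diff, not the final code):</p>
    <pre>        p = container_of(svms, struct kfd_process, svms);

        /* get_task_mm() pins mm_users through the task reference instead of
         * trusting the cached p-&gt;mm pointer.
         */
        mm = get_task_mm(p-&gt;lead_thread);
        if (!mm) {
                pr_debug("svms 0x%p process mm gone\n", svms);
                return;
        }

        /* ... mmap_write_lock(mm), drain check, deferred list handling ... */

        mmap_write_unlock(mm);
        mmput(mm);</pre>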
    <blockquote type="cite" cite="mid:fb6b3750-56e4-8a79-c479-10c4cbbd3e4a@amd.com">
      <br>
      <blockquote type="cite">+        pr_debug("svms 0x%p process mm
        gone\n", svms);
        <br>
        +        return;
        <br>
        +    }
        <br>
        +retry:
        <br>
        +    mmap_write_lock(mm);
        <br>
        +
        <br>
        +    /* Checking for the need to drain retry faults must be
        inside
        <br>
        +     * mmap write lock to serialize with munmap notifiers.
        <br>
        +     */
        <br>
        +    if (unlikely(READ_ONCE(svms->drain_pagefaults))) {
        <br>
        +        WRITE_ONCE(svms->drain_pagefaults, false);
        <br>
        +        mmap_write_unlock(mm);
        <br>
        +        svm_range_drain_retry_fault(svms);
        <br>
        +        goto retry;
        <br>
        +    }
        <br>
        +
        <br>
              spin_lock(&svms->deferred_list_lock);
        <br>
              while (!list_empty(&svms->deferred_range_list)) {
        <br>
                  prange =
        list_first_entry(&svms->deferred_range_list,
        <br>
                                struct svm_range, deferred_list);
        <br>
        +        list_del_init(&prange->deferred_list);
        <br>
                  spin_unlock(&svms->deferred_list_lock);
        <br>
        +
        <br>
                  pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
        <br>
                       prange->start, prange->last,
        prange->work_item.op);
        <br>
          -        mm = prange->work_item.mm;
        <br>
        -retry:
        <br>
        -        mmap_write_lock(mm);
        <br>
                  mutex_lock(&svms->lock);
        <br>
        -
        <br>
        -        /* Checking for the need to drain retry faults must be
        in
        <br>
        -         * mmap write lock to serialize with munmap notifiers.
        <br>
        -         *
        <br>
        -         * Remove from deferred_list must be inside mmap write
        lock,
        <br>
        -         * otherwise, svm_range_list_lock_and_flush_work may
        hold mmap
        <br>
        -         * write lock, and continue because deferred_list is
        empty, then
        <br>
        -         * deferred_list handle is blocked by mmap write lock.
        <br>
        -         */
        <br>
        -        spin_lock(&svms->deferred_list_lock);
        <br>
        -        if (unlikely(svms->drain_pagefaults)) {
        <br>
        -            svms->drain_pagefaults = false;
        <br>
        -            spin_unlock(&svms->deferred_list_lock);
        <br>
        -            mutex_unlock(&svms->lock);
        <br>
        -            mmap_write_unlock(mm);
        <br>
        -            svm_range_drain_retry_fault(svms);
        <br>
        -            goto retry;
        <br>
        -        }
        <br>
        -        list_del_init(&prange->deferred_list);
        <br>
        -        spin_unlock(&svms->deferred_list_lock);
        <br>
        -
        <br>
                  mutex_lock(&prange->migrate_mutex);
        <br>
                  while (!list_empty(&prange->child_list)) {
        <br>
                      struct svm_range *pchild;
        <br>
        @@ -2031,12 +2034,13 @@ static void
        svm_range_deferred_list_work(struct work_struct *work)
        <br>
                    svm_range_handle_list_op(svms, prange);
        <br>
                  mutex_unlock(&svms->lock);
        <br>
        -        mmap_write_unlock(mm);
        <br>
                    spin_lock(&svms->deferred_list_lock);
        <br>
              }
        <br>
              spin_unlock(&svms->deferred_list_lock);
        <br>
          +    mmap_write_unlock(mm);
        <br>
        +    mmput(mm);
        <br>
              pr_debug("exit svms 0x%p\n", svms);
        <br>
          }
        <br>
          @@ -2600,12 +2604,8 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
                pr_debug("restoring svms 0x%p fault address 0x%llx\n",
        svms, addr);
        <br>
          -    mm = get_task_mm(p->lead_thread);
        <br>
        -    if (!mm) {
        <br>
        -        pr_debug("svms 0x%p failed to get mm\n", svms);
        <br>
        -        r = -ESRCH;
        <br>
        -        goto out;
        <br>
        -    }
        <br>
        +    /* mm is available because kfd_process_notifier_release
        drain fault */
        <br>
      </blockquote>
      This is not a valid assumption because the mm_users count is 0
      when the notifier_release runs. So you can't rely on the mm being
      usable here while you're draining faults in notifier_release.
      <br>
      <br>
      A better way to avoid the deadlock would be to drain faults not in
      notifier_release, but in kfd_process_wq_release.
      <br>
    </blockquote>
    <p>Good idea to drain faults in kfd_process_wq_release; then we can
      keep get_task_mm(pdd-&gt;process-&gt;lead_thread). If the task mm is
      gone, it is safe to ignore the fault and return 0 instead of -ESRCH,
      as in the sketch below.<br>
    </p>
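    <p>A rough sketch of what that could look like in
      svm_range_restore_pages(), based on the code removed in the diff above
      (an assumption about the next revision, not the final patch):</p>
    <pre>        mm = get_task_mm(p-&gt;lead_thread);
        if (!mm) {
                pr_debug("svms 0x%p failed to get mm\n", svms);
                /* The process is exiting, so the fault is stale; drop it
                 * instead of reporting -ESRCH.
                 */
                r = 0;
                goto out;
        }</pre>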
    <p>Regards,</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:fb6b3750-56e4-8a79-c479-10c4cbbd3e4a@amd.com">
      <br>
      Regards,
      <br>
        Felix
      <br>
      <br>
      <br>
      <blockquote type="cite">+    mm = p->mm;
        <br>
                mmap_read_lock(mm);
        <br>
          retry_write_locked:
        <br>
        @@ -2708,7 +2708,6 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
                svm_range_count_fault(adev, p, gpuidx);
        <br>
          -    mmput(mm);
        <br>
          out:
        <br>
              kfd_unref_process(p);
        <br>
          diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
        b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
        <br>
        index 6dc91c33e80f..0a8bcdb3dddf 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
        <br>
        @@ -189,6 +189,7 @@ void svm_range_prefault(struct svm_range
        *prange, struct mm_struct *mm,
        <br>
          struct kfd_process_device *
        <br>
          svm_range_get_pdd_by_adev(struct svm_range *prange, struct
        amdgpu_device *adev);
        <br>
          void svm_range_list_lock_and_flush_work(struct svm_range_list
        *svms, struct mm_struct *mm);
        <br>
        +void svm_range_drain_retry_fault(struct svm_range_list *svms);
        <br>
            /* SVM API and HMM page migration work together, device
        memory type
        <br>
           * is initialized to not 0 when page migration register device
        memory.
        <br>
      </blockquote>
    </blockquote>
  </body>
</html>