<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2022-03-14 3:58 p.m., Felix Kuehling
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:486b326f-8842-5de4-07ad-7ab1f209132e@amd.com">Am
      2022-03-14 um 10:50 schrieb Philip Yang:
      <br>
      <blockquote type="cite">Migrate vram to ram may fail to find the
        vma if process is exiting
        <br>
        and vma is removed, evict svm bo worker sets prange->svm_bo
        to NULL
        <br>
        and warn svm_bo ref count != 1 only if migrating vram to ram
        <br>
        successfully.
        <br>
        <br>
        Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
        <br>
        ---<br>
         drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 27 +++++++++++++++++++-----<br>
         drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 20 ++++++++++--------<br>
         2 files changed, 33 insertions(+), 14 deletions(-)<br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c<br>
        index 7f689094be5a..c8aef2fe459b 100644<br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c<br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c<br>
        @@ -638,6 +638,22 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,<br>
             return r;<br>
         }<br>
        <br>
        +/**<br>
        + * svm_migrate_vma_to_ram - migrate range inside one vma from device to system<br>
        + *<br>
        + * @adev: amdgpu device to migrate from<br>
        + * @prange: svm range structure<br>
        + * @vma: vm_area_struct that range [start, end] belongs to<br>
        + * @start: range start virtual address in pages<br>
        + * @end: range end virtual address in pages<br>
        + *<br>
        + * Context: Process context, caller hold mmap read lock, svms lock, prange lock<br>
        + *<br>
        + * Return:<br>
        + *   0 - success with all pages migrated<br>
        + *   negative values - indicate error<br>
        + *   positive values - partial migration, number of pages not migrated<br>
        + */<br>
         static long<br>
         svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,<br>
                        struct vm_area_struct *vma, uint64_t start, uint64_t end)<br>
        @@ -709,8 +725,6 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,<br>
                 pdd = svm_range_get_pdd_by_adev(prange, adev);<br>
                 if (pdd)<br>
                     WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);<br>
        -<br>
        -        return upages;<br>
             }<br>
             return r ? r : upages;<br>
         }<br>
        @@ -759,13 +773,16 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm)<br>
                 unsigned long next;<br>
        <br>
                 vma = find_vma(mm, addr);<br>
        -        if (!vma || addr < vma->vm_start)<br>
        +        if (!vma || addr < vma->vm_start) {<br>
        +            pr_debug("failed to find vma for prange %p\n", prange);<br>
        +            r = -EFAULT;<br>
                     break;<br>
        +        }<br>
        <br>
                 next = min(vma->vm_end, end);<br>
                 r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next);<br>
                 if (r < 0) {<br>
        -            pr_debug("failed %ld to migrate\n", r);<br>
        +            pr_debug("failed %ld to migrate prange %p\n", r, prange);<br>
                     break;<br>
                 } else {<br>
                     upages += r;<br>
        @@ -773,7 +790,7 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm)<br>
                 addr = next;<br>
             }<br>
        <br>
        -    if (!upages) {<br>
        +    if (r >= 0 && !upages) {<br>
                 svm_range_vram_node_free(prange);<br>
                 prange->actual_loc = 0;<br>
             }<br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c<br>
        index 509d915cbe69..215943424c06 100644<br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c<br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c<br>
        @@ -3155,6 +3155,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)<br>
             struct svm_range_bo *svm_bo;<br>
             struct kfd_process *p;<br>
             struct mm_struct *mm;<br>
        +    int r = 0;<br>
        <br>
             svm_bo = container_of(work, struct svm_range_bo, eviction_work);<br>
             if (!svm_bo_ref_unless_zero(svm_bo))<br>
        @@ -3170,7 +3171,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)<br>
        <br>
             mmap_read_lock(mm);<br>
             spin_lock(&svm_bo->list_lock);<br>
        -    while (!list_empty(&svm_bo->range_list)) {<br>
        +    while (!list_empty(&svm_bo->range_list) && !r) {<br>
                 struct svm_range *prange =<br>
                         list_first_entry(&svm_bo->range_list,<br>
                                 struct svm_range, svm_bo_list);<br>
        @@ -3184,15 +3185,15 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)<br>
        <br>
                 mutex_lock(&prange->migrate_mutex);<br>
                 do {<br>
        -            svm_migrate_vram_to_ram(prange,<br>
        +            r = svm_migrate_vram_to_ram(prange,<br>
                             svm_bo->eviction_fence->mm);<br>
        -        } while (prange->actual_loc && --retries);<br>
        -        WARN(prange->actual_loc, "Migration failed during eviction");<br>
        -<br>
        -        mutex_lock(&prange->lock);<br>
        -        prange->svm_bo = NULL;<br>
        -        mutex_unlock(&prange->lock);<br>
        +        } while (!r && prange->actual_loc && --retries);<br>
      </blockquote>
      <br>
      I think it would still be good to have a WARN here for cases other
      than process termination. Is there a way to distinguish the process
      termination case from the error code? Maybe we could even avoid the
      retry in this case.
      <br>
      <br>
    </blockquote>
    <p>It was a bug that prange->actual_loc was set to 0 even when the
      vma was not found; that is why this WARN never triggered. With this
      fix, the WARN is largely a duplicate of the WARN_ONCE below. I will
      change it to pr_info_once to help debugging, since the WARN_ONCE
      below flags the really critical issue, TTM being asked to allocate
      VRAM from the BO while the svm_bo ref count is not 1.</p>
    <p>The retry is already avoided when r returns an error code.</p>
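    <p>Roughly what I have in mind for v2 is the sketch below for the loop
      body in svm_range_evict_svm_bo_worker. It is untested, and the exact
      condition and message wording are placeholders:</p>
    <pre>
		mutex_lock(&prange->migrate_mutex);
		do {
			/* Stop retrying as soon as migration returns an
			 * error, e.g. -EFAULT when the vma is gone because
			 * the process is exiting.
			 */
			r = svm_migrate_vram_to_ram(prange,
					svm_bo->eviction_fence->mm);
		} while (!r && prange->actual_loc && --retries);

		/* Replaces the old WARN: still useful for debugging, but
		 * the WARN_ONCE on the svm_bo ref count below catches the
		 * really critical case.
		 */
		if (!r && prange->actual_loc)
			pr_info_once("migration failed during eviction\n");

		if (!prange->actual_loc) {
			mutex_lock(&prange->lock);
			prange->svm_bo = NULL;
			mutex_unlock(&prange->lock);
		}
		mutex_unlock(&prange->migrate_mutex);
    </pre>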
    <p>Regards,</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:486b326f-8842-5de4-07ad-7ab1f209132e@amd.com">Other than
      that, this patch is a good improvement on the error handling.
      <br>
      <br>
      Regards,
      <br>
        Felix
      <br>
      <br>
      <br>
      <blockquote type="cite">  +        if (!prange->actual_loc) {
        <br>
        +            mutex_lock(&prange->lock);
        <br>
        +            prange->svm_bo = NULL;
        <br>
        +            mutex_unlock(&prange->lock);
        <br>
        +        }
        <br>
                  mutex_unlock(&prange->migrate_mutex);
        <br>
                    spin_lock(&svm_bo->list_lock);
        <br>
        @@ -3201,10 +3202,11 @@ static void
        svm_range_evict_svm_bo_worker(struct work_struct *work)
        <br>
              mmap_read_unlock(mm);
        <br>
               
        dma_fence_signal(&svm_bo->eviction_fence->base);
        <br>
        +
        <br>
              /* This is the last reference to svm_bo, after
        svm_range_vram_node_free
        <br>
               * has been called in svm_migrate_vram_to_ram
        <br>
               */
        <br>
        -    WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was
        not the last reference\n");
        <br>
        +    WARN_ONCE(!r && kref_read(&svm_bo->kref) !=
        1, "This was not the last reference\n");
        <br>
              svm_range_bo_unref(svm_bo);
        <br>
          }
        <br>
          </blockquote>
    </blockquote>
  </body>
</html>