<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2024-05-02 00:09, Chen, Xiaogang
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:d949949b-fb30-45c7-a53d-a4d32874b3c7@amd.com">
      <br>
      On 5/1/2024 5:56 PM, Philip Yang wrote:
      <br>
      <blockquote type="cite">Caution: This message originated from an
        External Source. Use proper caution when opening attachments,
        clicking links, or responding.
        <br>
        <br>
        <br>
        On system with khugepaged enabled and user cases with THP
        buffer, the
        <br>
        hmm_range_fault may takes > 15 seconds to return -EBUSY, the
        arbitrary
        <br>
        timeout value is not accurate, cause memory allocation failure.
        <br>
        <br>
        Remove the arbitrary timeout value, return EAGAIN to application
        if
        <br>
        hmm_range_fault return EBUSY, then userspace libdrm and Thunk
        will call
        <br>
        ioctl again.
        <br>
      </blockquote>
      <br>
      Wonder why letting user space do retry is better? Seems this issue
      is caused by hugepage merging, so how user space can avoid it?
      <br>
    </blockquote>
    The issue is caused by khugepaged + 4 processes + sdma stalls test
    (to slow down sdma) + small_BAR + QPX mode, during overnight test,
    hmm_range_fault 180MB buffer may takes >15 seconds returns EBUSY,
    then alloc memory ioctl failed. Return EAGAIN, Thunk will call the
    alloc memory ioctl again, and we don't see the alloc memory
    failure.  <br>
    <blockquote type="cite" cite="mid:d949949b-fb30-45c7-a53d-a4d32874b3c7@amd.com">
      <br>
      And applications may not use Thunk or libdrm, instead, use ioctl
      directly.
      <br>
    </blockquote>
    <p>If app calls ioctl directly, it should do the same thing, to call
      ioctl again if errno is EINTR or EAGAIN.</p>
    <p>Regards,</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:d949949b-fb30-45c7-a53d-a4d32874b3c7@amd.com">
      <br>
      Regards
      <br>
      <br>
      Xiaogang
      <br>
      <br>
      <blockquote type="cite">Change EAGAIN to debug message as this is
        not error.
        <br>
        <br>
        Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
        <br>
        ---
        <br>
          drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  5 ++++-
        <br>
          drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c          | 12
        +++---------
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_svm.c             |  5 +----
        <br>
          3 files changed, 8 insertions(+), 14 deletions(-)
        <br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
        b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
        <br>
        index 54198c3928c7..02696c2102f1 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
        <br>
        @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem
        *mem, uint64_t user_addr,
        <br>
        <br>
                 ret = amdgpu_ttm_tt_get_user_pages(bo,
        bo->tbo.ttm->pages, &range);
        <br>
                 if (ret) {
        <br>
        -               pr_err("%s: Failed to get user pages: %d\n",
        __func__, ret);
        <br>
        +               if (ret == -EAGAIN)
        <br>
        +                       pr_debug("Failed to get user pages, try
        again\n");
        <br>
        +               else
        <br>
        +                       pr_err("%s: Failed to get user pages:
        %d\n", __func__, ret);
        <br>
                         goto unregister_out;
        <br>
                 }
        <br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
        b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
        <br>
        index 431ec72655ec..e36fede7f74c 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
        <br>
        @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct
        mmu_interval_notifier *notifier,
        <br>
                         pr_debug("hmm range: start = 0x%lx, end =
        0x%lx",
        <br>
                                 hmm_range->start,
        hmm_range->end);
        <br>
        <br>
        -               /* Assuming 64MB takes maximum 1 second to fault
        page address */
        <br>
        -               timeout = max((hmm_range->end -
        hmm_range->start) >> 26, 1UL);
        <br>
        -               timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
        <br>
        -               timeout = jiffies + msecs_to_jiffies(timeout);
        <br>
        +               timeout = jiffies +
        msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
        <br>
        <br>
          retry:
        <br>
                         hmm_range->notifier_seq =
        mmu_interval_read_begin(notifier);
        <br>
                         r = hmm_range_fault(hmm_range);
        <br>
                         if (unlikely(r)) {
        <br>
        -                       schedule();
        <br>
        -                       /*
        <br>
        -                        * FIXME: This timeout should encompass
        the retry from
        <br>
        -                        * mmu_interval_read_retry() as well.
        <br>
        -                        */
        <br>
                                 if (r == -EBUSY &&
        !time_after(jiffies, timeout))
        <br>
                                         goto retry;
        <br>
                                 goto out_free_pfns;
        <br>
        @@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct
        mmu_interval_notifier *notifier,
        <br>
          out_free_range:
        <br>
                 kfree(hmm_range);
        <br>
        <br>
        +       if (r == -EBUSY)
        <br>
        +               r = -EAGAIN;
        <br>
                 return r;
        <br>
          }
        <br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        index 94f83be2232d..e7040f809f33 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        @@ -1670,11 +1670,8 @@ static int
        svm_range_validate_and_map(struct mm_struct *mm,
        <br>
                                                               
        readonly, owner, NULL,
        <br>
                                                               
        &hmm_range);
        <br>
                                 WRITE_ONCE(p->svms.faulting_task,
        NULL);
        <br>
        -                       if (r) {
        <br>
        +                       if (r)
        <br>
                                         pr_debug("failed %d to get svm
        range pages\n", r);
        <br>
        -                               if (r == -EBUSY)
        <br>
        -                                       r = -EAGAIN;
        <br>
        -                       }
        <br>
                         } else {
        <br>
                                 r = -EFAULT;
        <br>
                         }
        <br>
        --
        <br>
        2.43.2
        <br>
        <br>
      </blockquote>
    </blockquote>
  </body>
</html>