[PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault
Felix Kuehling
felix.kuehling at amd.com
Mon May 6 19:54:50 UTC 2024
On 2024-05-01 18:56, Philip Yang wrote:
> On systems with khugepaged enabled and use cases with THP buffers,
> hmm_range_fault may take more than 15 seconds to return -EBUSY; the arbitrary
> timeout value is not accurate and causes memory allocation failures.
>
> Remove the arbitrary timeout value and return EAGAIN to the application if
> hmm_range_fault returns EBUSY; userspace libdrm and Thunk will then call the
> ioctl again.
>
> Change the EAGAIN message to a debug message, as this is not an error.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
Assuming this passes your stress testing without CPU stall warnings,
this patch is
Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 ++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 12 +++---------
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 +----
> 3 files changed, 8 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 54198c3928c7..02696c2102f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
>
> ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
> if (ret) {
> - pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
> + if (ret == -EAGAIN)
> + pr_debug("Failed to get user pages, try again\n");
> + else
> + pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
> goto unregister_out;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
> index 431ec72655ec..e36fede7f74c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
> @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier,
> pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
> hmm_range->start, hmm_range->end);
>
> - /* Assuming 64MB takes maximum 1 second to fault page address */
> - timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
> - timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
> - timeout = jiffies + msecs_to_jiffies(timeout);
> + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
>
> retry:
> hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
> r = hmm_range_fault(hmm_range);
> if (unlikely(r)) {
> - schedule();
> - /*
> - * FIXME: This timeout should encompass the retry from
> - * mmu_interval_read_retry() as well.
> - */
> if (r == -EBUSY && !time_after(jiffies, timeout))
> goto retry;
> goto out_free_pfns;
> @@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier,
> out_free_range:
> kfree(hmm_range);
>
> + if (r == -EBUSY)
> + r = -EAGAIN;
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 94f83be2232d..e7040f809f33 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1670,11 +1670,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
> readonly, owner, NULL,
> &hmm_range);
> WRITE_ONCE(p->svms.faulting_task, NULL);
> - if (r) {
> + if (r)
> pr_debug("failed %d to get svm range pages\n", r);
> - if (r == -EBUSY)
> - r = -EAGAIN;
> - }
> } else {
> r = -EFAULT;
> }
More information about the amd-gfx
mailing list