[PATCH 2/4] drm/amdgpu: fix userptr HMM range handling v2
Felix Kuehling
felix.kuehling at amd.com
Tue Nov 15 18:15:05 UTC 2022
On 2022-11-15 08:38, Christian König wrote:
> The basic problem here is that it's not allowed to page fault while
> holding the reservation lock.
>
> So it can happen that multiple processes try to validate an userptr
> at the same time.
>
> Work around that by putting the HMM range object into the mutex
> protected bo list for now.
>
> v2: make sure range is set to NULL in case of an error
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> CC: stable at vger.kernel.org
> ---
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h | 3 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 8 +--
> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 6 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 53 ++++++-------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 14 +++--
> 7 files changed, 46 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index c5c9bfa2772e..83659e6419a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -940,6 +940,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
> struct amdkfd_process_info *process_info = mem->process_info;
> struct amdgpu_bo *bo = mem->bo;
> struct ttm_operation_ctx ctx = { true, false };
> + struct hmm_range *range;
> int ret = 0;
>
> mutex_lock(&process_info->lock);
> @@ -969,7 +970,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
> return 0;
> }
>
> - ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
> + ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
> if (ret) {
> pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
> goto unregister_out;
> @@ -987,7 +988,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
> amdgpu_bo_unreserve(bo);
>
> release_out:
> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
> unregister_out:
> if (ret)
> amdgpu_mn_unregister(bo);
> @@ -2319,6 +2320,8 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
> /* Go through userptr_inval_list and update any invalid user_pages */
> list_for_each_entry(mem, &process_info->userptr_inval_list,
> validate_list.head) {
> + struct hmm_range *range;
> +
> invalid = atomic_read(&mem->invalid);
> if (!invalid)
> /* BO hasn't been invalidated since the last
> @@ -2329,7 +2332,8 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
> bo = mem->bo;
>
> /* Get updated user pages */
> - ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
> + ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
> + &range);
> if (ret) {
> pr_debug("Failed %d to get user pages\n", ret);
>
> @@ -2348,7 +2352,7 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
> * FIXME: Cannot ignore the return code, must hold
> * notifier_lock
> */
> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
> }
>
> /* Mark the BO as valid unless it was invalidated
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
> index 2168163aad2d..252a876b0725 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
> @@ -209,6 +209,7 @@ void amdgpu_bo_list_get_list(struct amdgpu_bo_list *list,
> list_add_tail(&e->tv.head, &bucket[priority]);
>
> e->user_pages = NULL;
> + e->range = NULL;
> }
>
> /* Connect the sorted buckets in the output list. */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
> index 9caea1688fc3..e4d78491bcc7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
> @@ -26,6 +26,8 @@
> #include <drm/ttm/ttm_execbuf_util.h>
> #include <drm/amdgpu_drm.h>
>
> +struct hmm_range;
> +
> struct amdgpu_device;
> struct amdgpu_bo;
> struct amdgpu_bo_va;
> @@ -36,6 +38,7 @@ struct amdgpu_bo_list_entry {
> struct amdgpu_bo_va *bo_va;
> uint32_t priority;
> struct page **user_pages;
> + struct hmm_range *range;
> bool user_invalidated;
> };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index d371000a5727..7f9cedd8e157 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -910,7 +910,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
> goto out_free_user_pages;
> }
>
> - r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages);
> + r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages, &e->range);
> if (r) {
> kvfree(e->user_pages);
> e->user_pages = NULL;
> @@ -988,9 +988,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>
> if (!e->user_pages)
> continue;
> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
> kvfree(e->user_pages);
> e->user_pages = NULL;
> + e->range = NULL;
> }
> mutex_unlock(&p->bo_list->bo_list_mutex);
> return r;
> @@ -1265,7 +1266,8 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
> amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
> struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>
> - r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> + r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
> + e->range = NULL;
> }
> if (r) {
> r = -EAGAIN;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 111484ceb47d..91571b1324f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -378,6 +378,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data,
> struct amdgpu_device *adev = drm_to_adev(dev);
> struct drm_amdgpu_gem_userptr *args = data;
> struct drm_gem_object *gobj;
> + struct hmm_range *range;
> struct amdgpu_bo *bo;
> uint32_t handle;
> int r;
> @@ -418,7 +419,8 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data,
> goto release_object;
>
> if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE) {
> - r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
> + r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
> + &range);
> if (r)
> goto release_object;
>
> @@ -441,7 +443,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data,
>
> user_pages_done:
> if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE)
> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>
> release_object:
> drm_gem_object_put(gobj);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 76a8ebfc9e71..29c22a27db80 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -642,9 +642,6 @@ struct amdgpu_ttm_tt {
> struct task_struct *usertask;
> uint32_t userflags;
> bool bound;
> -#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
> - struct hmm_range *range;
> -#endif
> };
>
> #define ttm_to_amdgpu_ttm_tt(ptr) container_of(ptr, struct amdgpu_ttm_tt, ttm)
> @@ -657,7 +654,8 @@ struct amdgpu_ttm_tt {
> * Calling function must call amdgpu_ttm_tt_userptr_range_done() once and only
> * once afterwards to stop HMM tracking
> */
> -int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
> +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages,
> + struct hmm_range **range)
> {
> struct ttm_tt *ttm = bo->tbo.ttm;
> struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
> @@ -667,16 +665,15 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
> bool readonly;
> int r = 0;
>
> + /* Make sure get_user_pages_done() can cleanup gracefully */
> + *range = NULL;
> +
> mm = bo->notifier.mm;
> if (unlikely(!mm)) {
> DRM_DEBUG_DRIVER("BO is not registered?\n");
> return -EFAULT;
> }
>
> - /* Another get_user_pages is running at the same time?? */
> - if (WARN_ON(gtt->range))
> - return -EFAULT;
> -
> if (!mmget_not_zero(mm)) /* Happens during process shutdown */
> return -ESRCH;
>
> @@ -694,7 +691,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
>
> readonly = amdgpu_ttm_tt_is_readonly(ttm);
> r = amdgpu_hmm_range_get_pages(&bo->notifier, mm, pages, start,
> - ttm->num_pages, >t->range, readonly,
> + ttm->num_pages, range, readonly,
> true, NULL);
> out_unlock:
> mmap_read_unlock(mm);
> @@ -712,30 +709,24 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
> *
> * Returns: true if pages are still valid
> */
> -bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
> + struct hmm_range *range)
> {
> struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
> - bool r = false;
>
> - if (!gtt || !gtt->userptr)
> + if (!gtt || !gtt->userptr || !range)
> return false;
>
> DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%x\n",
> gtt->userptr, ttm->num_pages);
>
> - WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns,
> - "No user pages to check\n");
> + WARN_ONCE(!range->hmm_pfns, "No user pages to check\n");
>
> - if (gtt->range) {
> - /*
> - * FIXME: Must always hold notifier_lock for this, and must
> - * not ignore the return code.
> - */
> - r = amdgpu_hmm_range_get_pages_done(gtt->range);
> - gtt->range = NULL;
> - }
> -
> - return !r;
> + /*
> + * FIXME: Must always hold notifier_lock for this, and must
> + * not ignore the return code.
> + */
> + return !amdgpu_hmm_range_get_pages_done(range);
> }
> #endif
>
> @@ -812,20 +803,6 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_device *bdev,
> /* unmap the pages mapped to the device */
> dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
> sg_free_table(ttm->sg);
> -
> -#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
> - if (gtt->range) {
> - unsigned long i;
> -
> - for (i = 0; i < ttm->num_pages; i++) {
> - if (ttm->pages[i] !=
> - hmm_pfn_to_page(gtt->range->hmm_pfns[i]))
> - break;
> - }
> -
> - WARN((i == ttm->num_pages), "Missing get_user_page_done\n");
> - }
> -#endif
> }
>
> static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index 6a70818039dd..a37207011a69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -39,6 +39,8 @@
>
> #define AMDGPU_POISON 0xd0bed0be
>
> +struct hmm_range;
> +
> struct amdgpu_gtt_mgr {
> struct ttm_resource_manager manager;
> struct drm_mm mm;
> @@ -149,15 +151,19 @@ void amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo);
> uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev, uint32_t type);
>
> #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
> -int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages);
> -bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm);
> +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages,
> + struct hmm_range **range);
> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
> + struct hmm_range *range);
> #else
> static inline int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo,
> - struct page **pages)
> + struct page **pages,
> + struct hmm_range **range)
> {
> return -EPERM;
> }
> -static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
> +static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
> + struct hmm_range *range)
> {
> return false;
> }
More information about the amd-gfx
mailing list