[PATCH 2/4] drm/amdgpu: fix userptr HMM range handling
Christian König
christian.koenig at amd.com
Tue Nov 15 13:37:15 UTC 2022
Am 10.11.22 um 22:55 schrieb Felix Kuehling:
>
> Am 2022-11-10 um 08:00 schrieb Christian König:
>> The basic problem here is that it's not allowed to page fault while
>> holding the reservation lock.
>>
>> So it can happen that multiple processes try to validate an userptr
>> at the same time.
>>
>> Work around that by putting the HMM range object into the mutex
>> protected bo list for now.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> CC: stable at vger.kernel.org
>> ---
>> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +++--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c | 1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h | 3 ++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 8 +--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 6 ++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 50 +++++--------------
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 14 ++++--
>> 7 files changed, 43 insertions(+), 51 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index c5c9bfa2772e..83659e6419a8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -940,6 +940,7 @@ static int init_user_pages(struct kgd_mem *mem,
>> uint64_t user_addr,
>> struct amdkfd_process_info *process_info = mem->process_info;
>> struct amdgpu_bo *bo = mem->bo;
>> struct ttm_operation_ctx ctx = { true, false };
>> + struct hmm_range *range;
>
> I'd feel better if these local hmm_range pointers here and in
> update_invalid_user_pages and amdgpu_gem_userptr_ioctl were
> initialized to NULL. amdgpu_ttm_tt_get_user_pages leaves it
> uninitialized in case of errors and amdgpu_ttm_tt_get_user_pages_done
> checks for !range.
I've opted to initializing range to NULL in case of an error.
>
> With that fixed, the patch is
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
Does that still counts?
Thanks,
Christian.
>
>
>> int ret = 0;
>> mutex_lock(&process_info->lock);
>> @@ -969,7 +970,7 @@ static int init_user_pages(struct kgd_mem *mem,
>> uint64_t user_addr,
>> return 0;
>> }
>> - ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>> + ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
>> if (ret) {
>> pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>> goto unregister_out;
>> @@ -987,7 +988,7 @@ static int init_user_pages(struct kgd_mem *mem,
>> uint64_t user_addr,
>> amdgpu_bo_unreserve(bo);
>> release_out:
>> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>> unregister_out:
>> if (ret)
>> amdgpu_mn_unregister(bo);
>> @@ -2319,6 +2320,8 @@ static int update_invalid_user_pages(struct
>> amdkfd_process_info *process_info,
>> /* Go through userptr_inval_list and update any invalid
>> user_pages */
>> list_for_each_entry(mem, &process_info->userptr_inval_list,
>> validate_list.head) {
>> + struct hmm_range *range;
>> +
>> invalid = atomic_read(&mem->invalid);
>> if (!invalid)
>> /* BO hasn't been invalidated since the last
>> @@ -2329,7 +2332,8 @@ static int update_invalid_user_pages(struct
>> amdkfd_process_info *process_info,
>> bo = mem->bo;
>> /* Get updated user pages */
>> - ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>> + ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
>> + &range);
>> if (ret) {
>> pr_debug("Failed %d to get user pages\n", ret);
>> @@ -2348,7 +2352,7 @@ static int update_invalid_user_pages(struct
>> amdkfd_process_info *process_info,
>> * FIXME: Cannot ignore the return code, must hold
>> * notifier_lock
>> */
>> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>> }
>> /* Mark the BO as valid unless it was invalidated
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> index 2168163aad2d..252a876b0725 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> @@ -209,6 +209,7 @@ void amdgpu_bo_list_get_list(struct
>> amdgpu_bo_list *list,
>> list_add_tail(&e->tv.head, &bucket[priority]);
>> e->user_pages = NULL;
>> + e->range = NULL;
>> }
>> /* Connect the sorted buckets in the output list. */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> index 9caea1688fc3..e4d78491bcc7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> @@ -26,6 +26,8 @@
>> #include <drm/ttm/ttm_execbuf_util.h>
>> #include <drm/amdgpu_drm.h>
>> +struct hmm_range;
>> +
>> struct amdgpu_device;
>> struct amdgpu_bo;
>> struct amdgpu_bo_va;
>> @@ -36,6 +38,7 @@ struct amdgpu_bo_list_entry {
>> struct amdgpu_bo_va *bo_va;
>> uint32_t priority;
>> struct page **user_pages;
>> + struct hmm_range *range;
>> bool user_invalidated;
>> };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index d371000a5727..7f9cedd8e157 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -910,7 +910,7 @@ static int amdgpu_cs_parser_bos(struct
>> amdgpu_cs_parser *p,
>> goto out_free_user_pages;
>> }
>> - r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages);
>> + r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages, &e->range);
>> if (r) {
>> kvfree(e->user_pages);
>> e->user_pages = NULL;
>> @@ -988,9 +988,10 @@ static int amdgpu_cs_parser_bos(struct
>> amdgpu_cs_parser *p,
>> if (!e->user_pages)
>> continue;
>> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
>> kvfree(e->user_pages);
>> e->user_pages = NULL;
>> + e->range = NULL;
>> }
>> mutex_unlock(&p->bo_list->bo_list_mutex);
>> return r;
>> @@ -1265,7 +1266,8 @@ static int amdgpu_cs_submit(struct
>> amdgpu_cs_parser *p,
>> amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
>> struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>> - r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>> + r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
>> + e->range = NULL;
>> }
>> if (r) {
>> r = -EAGAIN;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index 111484ceb47d..91571b1324f2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -378,6 +378,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device
>> *dev, void *data,
>> struct amdgpu_device *adev = drm_to_adev(dev);
>> struct drm_amdgpu_gem_userptr *args = data;
>> struct drm_gem_object *gobj;
>> + struct hmm_range *range;
>> struct amdgpu_bo *bo;
>> uint32_t handle;
>> int r;
>> @@ -418,7 +419,8 @@ int amdgpu_gem_userptr_ioctl(struct drm_device
>> *dev, void *data,
>> goto release_object;
>> if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE) {
>> - r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>> + r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
>> + &range);
>> if (r)
>> goto release_object;
>> @@ -441,7 +443,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device
>> *dev, void *data,
>> user_pages_done:
>> if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE)
>> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>> release_object:
>> drm_gem_object_put(gobj);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index 76a8ebfc9e71..a56d28bd23be 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -642,9 +642,6 @@ struct amdgpu_ttm_tt {
>> struct task_struct *usertask;
>> uint32_t userflags;
>> bool bound;
>> -#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>> - struct hmm_range *range;
>> -#endif
>> };
>> #define ttm_to_amdgpu_ttm_tt(ptr) container_of(ptr, struct
>> amdgpu_ttm_tt, ttm)
>> @@ -657,7 +654,8 @@ struct amdgpu_ttm_tt {
>> * Calling function must call amdgpu_ttm_tt_userptr_range_done()
>> once and only
>> * once afterwards to stop HMM tracking
>> */
>> -int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page
>> **pages)
>> +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page
>> **pages,
>> + struct hmm_range **range)
>> {
>> struct ttm_tt *ttm = bo->tbo.ttm;
>> struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
>> @@ -673,10 +671,6 @@ int amdgpu_ttm_tt_get_user_pages(struct
>> amdgpu_bo *bo, struct page **pages)
>> return -EFAULT;
>> }
>> - /* Another get_user_pages is running at the same time?? */
>> - if (WARN_ON(gtt->range))
>> - return -EFAULT;
>> -
>> if (!mmget_not_zero(mm)) /* Happens during process shutdown */
>> return -ESRCH;
>> @@ -694,7 +688,7 @@ int amdgpu_ttm_tt_get_user_pages(struct
>> amdgpu_bo *bo, struct page **pages)
>> readonly = amdgpu_ttm_tt_is_readonly(ttm);
>> r = amdgpu_hmm_range_get_pages(&bo->notifier, mm, pages, start,
>> - ttm->num_pages, >t->range, readonly,
>> + ttm->num_pages, range, readonly,
>> true, NULL);
>> out_unlock:
>> mmap_read_unlock(mm);
>> @@ -712,30 +706,24 @@ int amdgpu_ttm_tt_get_user_pages(struct
>> amdgpu_bo *bo, struct page **pages)
>> *
>> * Returns: true if pages are still valid
>> */
>> -bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
>> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
>> + struct hmm_range *range)
>> {
>> struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
>> - bool r = false;
>> - if (!gtt || !gtt->userptr)
>> + if (!gtt || !gtt->userptr || !range)
>> return false;
>> DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%x\n",
>> gtt->userptr, ttm->num_pages);
>> - WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns,
>> - "No user pages to check\n");
>> + WARN_ONCE(!range->hmm_pfns, "No user pages to check\n");
>> - if (gtt->range) {
>> - /*
>> - * FIXME: Must always hold notifier_lock for this, and must
>> - * not ignore the return code.
>> - */
>> - r = amdgpu_hmm_range_get_pages_done(gtt->range);
>> - gtt->range = NULL;
>> - }
>> -
>> - return !r;
>> + /*
>> + * FIXME: Must always hold notifier_lock for this, and must
>> + * not ignore the return code.
>> + */
>> + return !amdgpu_hmm_range_get_pages_done(range);
>> }
>> #endif
>> @@ -812,20 +800,6 @@ static void amdgpu_ttm_tt_unpin_userptr(struct
>> ttm_device *bdev,
>> /* unmap the pages mapped to the device */
>> dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
>> sg_free_table(ttm->sg);
>> -
>> -#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>> - if (gtt->range) {
>> - unsigned long i;
>> -
>> - for (i = 0; i < ttm->num_pages; i++) {
>> - if (ttm->pages[i] !=
>> - hmm_pfn_to_page(gtt->range->hmm_pfns[i]))
>> - break;
>> - }
>> -
>> - WARN((i == ttm->num_pages), "Missing get_user_page_done\n");
>> - }
>> -#endif
>> }
>> static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> index 6a70818039dd..a37207011a69 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> @@ -39,6 +39,8 @@
>> #define AMDGPU_POISON 0xd0bed0be
>> +struct hmm_range;
>> +
>> struct amdgpu_gtt_mgr {
>> struct ttm_resource_manager manager;
>> struct drm_mm mm;
>> @@ -149,15 +151,19 @@ void amdgpu_ttm_recover_gart(struct
>> ttm_buffer_object *tbo);
>> uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev,
>> uint32_t type);
>> #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>> -int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page
>> **pages);
>> -bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm);
>> +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page
>> **pages,
>> + struct hmm_range **range);
>> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
>> + struct hmm_range *range);
>> #else
>> static inline int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo,
>> - struct page **pages)
>> + struct page **pages,
>> + struct hmm_range **range)
>> {
>> return -EPERM;
>> }
>> -static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt
>> *ttm)
>> +static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt
>> *ttm,
>> + struct hmm_range *range)
>> {
>> return false;
>> }
More information about the amd-gfx
mailing list