[PATCH 2/4] drm/amdgpu: fix userptr HMM range handling

Felix Kuehling felix.kuehling at amd.com
Tue Nov 15 15:36:43 UTC 2022


Am 2022-11-15 um 08:37 schrieb Christian König:
> Am 10.11.22 um 22:55 schrieb Felix Kuehling:
>>
>> Am 2022-11-10 um 08:00 schrieb Christian König:
>>> The basic problem here is that it's not allowed to page fault while
>>> holding the reservation lock.
>>>
>>> So it can happen that multiple processes try to validate an userptr
>>> at the same time.
>>>
>>> Work around that by putting the HMM range object into the mutex
>>> protected bo list for now.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> CC: stable at vger.kernel.org
>>> ---
>>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 12 +++--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c   |  1 +
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h   |  3 ++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  8 +--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       |  6 ++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       | 50 
>>> +++++--------------
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h       | 14 ++++--
>>>   7 files changed, 43 insertions(+), 51 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index c5c9bfa2772e..83659e6419a8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -940,6 +940,7 @@ static int init_user_pages(struct kgd_mem *mem, 
>>> uint64_t user_addr,
>>>       struct amdkfd_process_info *process_info = mem->process_info;
>>>       struct amdgpu_bo *bo = mem->bo;
>>>       struct ttm_operation_ctx ctx = { true, false };
>>> +    struct hmm_range *range;
>>
>> I'd feel better if these local hmm_range pointers here and in 
>> update_invalid_user_pages and amdgpu_gem_userptr_ioctl were 
>> initialized to NULL. amdgpu_ttm_tt_get_user_pages leaves it 
>> uninitialized in case of errors and amdgpu_ttm_tt_get_user_pages_done 
>> checks for !range.
>
> I've opted to initialize range to NULL in case of an error.
>
>>
>> With that fixed, the patch is
>>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>
> Does that still count?

If you don't mind, please send the updated patch again. It's always good 
to have a second pair of eyes on error handling.

Thanks,
   Felix


>
> Thanks,
> Christian.
>
>>
>>
>>>       int ret = 0;
>>>         mutex_lock(&process_info->lock);
>>> @@ -969,7 +970,7 @@ static int init_user_pages(struct kgd_mem *mem, 
>>> uint64_t user_addr,
>>>           return 0;
>>>       }
>>>   -    ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +    ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, 
>>> &range);
>>>       if (ret) {
>>>           pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>>           goto unregister_out;
>>> @@ -987,7 +988,7 @@ static int init_user_pages(struct kgd_mem *mem, 
>>> uint64_t user_addr,
>>>       amdgpu_bo_unreserve(bo);
>>>     release_out:
>>> -    amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>> +    amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>>>   unregister_out:
>>>       if (ret)
>>>           amdgpu_mn_unregister(bo);
>>> @@ -2319,6 +2320,8 @@ static int update_invalid_user_pages(struct 
>>> amdkfd_process_info *process_info,
>>>       /* Go through userptr_inval_list and update any invalid 
>>> user_pages */
>>>       list_for_each_entry(mem, &process_info->userptr_inval_list,
>>>                   validate_list.head) {
>>> +        struct hmm_range *range;
>>> +
>>>           invalid = atomic_read(&mem->invalid);
>>>           if (!invalid)
>>>               /* BO hasn't been invalidated since the last
>>> @@ -2329,7 +2332,8 @@ static int update_invalid_user_pages(struct 
>>> amdkfd_process_info *process_info,
>>>           bo = mem->bo;
>>>             /* Get updated user pages */
>>> -        ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +        ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
>>> +                           &range);
>>>           if (ret) {
>>>               pr_debug("Failed %d to get user pages\n", ret);
>>>   @@ -2348,7 +2352,7 @@ static int update_invalid_user_pages(struct 
>>> amdkfd_process_info *process_info,
>>>                * FIXME: Cannot ignore the return code, must hold
>>>                * notifier_lock
>>>                */
>>> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>> +            amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>>>           }
>>>             /* Mark the BO as valid unless it was invalidated
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> index 2168163aad2d..252a876b0725 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> @@ -209,6 +209,7 @@ void amdgpu_bo_list_get_list(struct 
>>> amdgpu_bo_list *list,
>>>               list_add_tail(&e->tv.head, &bucket[priority]);
>>>             e->user_pages = NULL;
>>> +        e->range = NULL;
>>>       }
>>>         /* Connect the sorted buckets in the output list. */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> index 9caea1688fc3..e4d78491bcc7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> @@ -26,6 +26,8 @@
>>>   #include <drm/ttm/ttm_execbuf_util.h>
>>>   #include <drm/amdgpu_drm.h>
>>>   +struct hmm_range;
>>> +
>>>   struct amdgpu_device;
>>>   struct amdgpu_bo;
>>>   struct amdgpu_bo_va;
>>> @@ -36,6 +38,7 @@ struct amdgpu_bo_list_entry {
>>>       struct amdgpu_bo_va        *bo_va;
>>>       uint32_t            priority;
>>>       struct page            **user_pages;
>>> +    struct hmm_range        *range;
>>>       bool                user_invalidated;
>>>   };
>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index d371000a5727..7f9cedd8e157 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -910,7 +910,7 @@ static int amdgpu_cs_parser_bos(struct 
>>> amdgpu_cs_parser *p,
>>>               goto out_free_user_pages;
>>>           }
>>>   -        r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages);
>>> +        r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages, 
>>> &e->range);
>>>           if (r) {
>>>               kvfree(e->user_pages);
>>>               e->user_pages = NULL;
>>> @@ -988,9 +988,10 @@ static int amdgpu_cs_parser_bos(struct 
>>> amdgpu_cs_parser *p,
>>>             if (!e->user_pages)
>>>               continue;
>>> -        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>> +        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
>>>           kvfree(e->user_pages);
>>>           e->user_pages = NULL;
>>> +        e->range = NULL;
>>>       }
>>>       mutex_unlock(&p->bo_list->bo_list_mutex);
>>>       return r;
>>> @@ -1265,7 +1266,8 @@ static int amdgpu_cs_submit(struct 
>>> amdgpu_cs_parser *p,
>>>       amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
>>>           struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>>>   -        r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>> +        r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, 
>>> e->range);
>>> +        e->range = NULL;
>>>       }
>>>       if (r) {
>>>           r = -EAGAIN;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> index 111484ceb47d..91571b1324f2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> @@ -378,6 +378,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device 
>>> *dev, void *data,
>>>       struct amdgpu_device *adev = drm_to_adev(dev);
>>>       struct drm_amdgpu_gem_userptr *args = data;
>>>       struct drm_gem_object *gobj;
>>> +    struct hmm_range *range;
>>>       struct amdgpu_bo *bo;
>>>       uint32_t handle;
>>>       int r;
>>> @@ -418,7 +419,8 @@ int amdgpu_gem_userptr_ioctl(struct drm_device 
>>> *dev, void *data,
>>>           goto release_object;
>>>         if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE) {
>>> -        r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +        r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
>>> +                         &range);
>>>           if (r)
>>>               goto release_object;
>>>   @@ -441,7 +443,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device 
>>> *dev, void *data,
>>>     user_pages_done:
>>>       if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE)
>>> -        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>> +        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>>>     release_object:
>>>       drm_gem_object_put(gobj);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> index 76a8ebfc9e71..a56d28bd23be 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> @@ -642,9 +642,6 @@ struct amdgpu_ttm_tt {
>>>       struct task_struct    *usertask;
>>>       uint32_t        userflags;
>>>       bool            bound;
>>> -#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>>> -    struct hmm_range    *range;
>>> -#endif
>>>   };
>>>     #define ttm_to_amdgpu_ttm_tt(ptr)    container_of(ptr, struct 
>>> amdgpu_ttm_tt, ttm)
>>> @@ -657,7 +654,8 @@ struct amdgpu_ttm_tt {
>>>    * Calling function must call amdgpu_ttm_tt_userptr_range_done() 
>>> once and only
>>>    * once afterwards to stop HMM tracking
>>>    */
>>> -int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page 
>>> **pages)
>>> +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page 
>>> **pages,
>>> +                 struct hmm_range **range)
>>>   {
>>>       struct ttm_tt *ttm = bo->tbo.ttm;
>>>       struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
>>> @@ -673,10 +671,6 @@ int amdgpu_ttm_tt_get_user_pages(struct 
>>> amdgpu_bo *bo, struct page **pages)
>>>           return -EFAULT;
>>>       }
>>>   -    /* Another get_user_pages is running at the same time?? */
>>> -    if (WARN_ON(gtt->range))
>>> -        return -EFAULT;
>>> -
>>>       if (!mmget_not_zero(mm)) /* Happens during process shutdown */
>>>           return -ESRCH;
>>>   @@ -694,7 +688,7 @@ int amdgpu_ttm_tt_get_user_pages(struct 
>>> amdgpu_bo *bo, struct page **pages)
>>>         readonly = amdgpu_ttm_tt_is_readonly(ttm);
>>>       r = amdgpu_hmm_range_get_pages(&bo->notifier, mm, pages, start,
>>> -                       ttm->num_pages, &gtt->range, readonly,
>>> +                       ttm->num_pages, range, readonly,
>>>                          true, NULL);
>>>   out_unlock:
>>>       mmap_read_unlock(mm);
>>> @@ -712,30 +706,24 @@ int amdgpu_ttm_tt_get_user_pages(struct 
>>> amdgpu_bo *bo, struct page **pages)
>>>    *
>>>    * Returns: true if pages are still valid
>>>    */
>>> -bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
>>> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
>>> +                       struct hmm_range *range)
>>>   {
>>>       struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
>>> -    bool r = false;
>>>   -    if (!gtt || !gtt->userptr)
>>> +    if (!gtt || !gtt->userptr || !range)
>>>           return false;
>>>         DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%x\n",
>>>           gtt->userptr, ttm->num_pages);
>>>   -    WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns,
>>> -        "No user pages to check\n");
>>> +    WARN_ONCE(!range->hmm_pfns, "No user pages to check\n");
>>>   -    if (gtt->range) {
>>> -        /*
>>> -         * FIXME: Must always hold notifier_lock for this, and must
>>> -         * not ignore the return code.
>>> -         */
>>> -        r = amdgpu_hmm_range_get_pages_done(gtt->range);
>>> -        gtt->range = NULL;
>>> -    }
>>> -
>>> -    return !r;
>>> +    /*
>>> +     * FIXME: Must always hold notifier_lock for this, and must
>>> +     * not ignore the return code.
>>> +     */
>>> +    return !amdgpu_hmm_range_get_pages_done(range);
>>>   }
>>>   #endif
>>>   @@ -812,20 +800,6 @@ static void 
>>> amdgpu_ttm_tt_unpin_userptr(struct ttm_device *bdev,
>>>       /* unmap the pages mapped to the device */
>>>       dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
>>>       sg_free_table(ttm->sg);
>>> -
>>> -#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>>> -    if (gtt->range) {
>>> -        unsigned long i;
>>> -
>>> -        for (i = 0; i < ttm->num_pages; i++) {
>>> -            if (ttm->pages[i] !=
>>> - hmm_pfn_to_page(gtt->range->hmm_pfns[i]))
>>> -                break;
>>> -        }
>>> -
>>> -        WARN((i == ttm->num_pages), "Missing get_user_page_done\n");
>>> -    }
>>> -#endif
>>>   }
>>>     static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> index 6a70818039dd..a37207011a69 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> @@ -39,6 +39,8 @@
>>>     #define AMDGPU_POISON    0xd0bed0be
>>>   +struct hmm_range;
>>> +
>>>   struct amdgpu_gtt_mgr {
>>>       struct ttm_resource_manager manager;
>>>       struct drm_mm mm;
>>> @@ -149,15 +151,19 @@ void amdgpu_ttm_recover_gart(struct 
>>> ttm_buffer_object *tbo);
>>>   uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev, 
>>> uint32_t type);
>>>     #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>>> -int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page 
>>> **pages);
>>> -bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm);
>>> +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page 
>>> **pages,
>>> +                 struct hmm_range **range);
>>> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
>>> +                       struct hmm_range *range);
>>>   #else
>>>   static inline int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo,
>>> -                           struct page **pages)
>>> +                           struct page **pages,
>>> +                           struct hmm_range **range)
>>>   {
>>>       return -EPERM;
>>>   }
>>> -static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt 
>>> *ttm)
>>> +static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt 
>>> *ttm,
>>> +                             struct hmm_range *range)
>>>   {
>>>       return false;
>>>   }
>


More information about the amd-gfx mailing list