[PATCH 2/4] drm/amdgpu: fix userptr HMM range handling

Christian König christian.koenig at amd.com
Tue Nov 15 13:37:15 UTC 2022


Am 10.11.22 um 22:55 schrieb Felix Kuehling:
>
> Am 2022-11-10 um 08:00 schrieb Christian König:
>> The basic problem here is that it's not allowed to page fault while
>> holding the reservation lock.
>>
>> So it can happen that multiple processes try to validate an userptr
>> at the same time.
>>
>> Work around that by putting the HMM range object into the mutex
>> protected bo list for now.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> CC: stable at vger.kernel.org
>> ---
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 12 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c   |  1 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h   |  3 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  8 +--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       |  6 ++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       | 50 +++++--------------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h       | 14 ++++--
>>   7 files changed, 43 insertions(+), 51 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index c5c9bfa2772e..83659e6419a8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -940,6 +940,7 @@ static int init_user_pages(struct kgd_mem *mem, 
>> uint64_t user_addr,
>>       struct amdkfd_process_info *process_info = mem->process_info;
>>       struct amdgpu_bo *bo = mem->bo;
>>       struct ttm_operation_ctx ctx = { true, false };
>> +    struct hmm_range *range;
>
> I'd feel better if these local hmm_range pointers here and in 
> update_invalid_user_pages and amdgpu_gem_userptr_ioctl were 
> initialized to NULL. amdgpu_ttm_tt_get_user_pages leaves it 
> uninitialized in case of errors and amdgpu_ttm_tt_get_user_pages_done 
> checks for !range.

I've opted to initializing range to NULL in case of an error.

>
> With that fixed, the patch is
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>

Does that still counts?

Thanks,
Christian.

>
>
>>       int ret = 0;
>>         mutex_lock(&process_info->lock);
>> @@ -969,7 +970,7 @@ static int init_user_pages(struct kgd_mem *mem, 
>> uint64_t user_addr,
>>           return 0;
>>       }
>>   -    ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>> +    ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
>>       if (ret) {
>>           pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>           goto unregister_out;
>> @@ -987,7 +988,7 @@ static int init_user_pages(struct kgd_mem *mem, 
>> uint64_t user_addr,
>>       amdgpu_bo_unreserve(bo);
>>     release_out:
>> -    amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>> +    amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>>   unregister_out:
>>       if (ret)
>>           amdgpu_mn_unregister(bo);
>> @@ -2319,6 +2320,8 @@ static int update_invalid_user_pages(struct 
>> amdkfd_process_info *process_info,
>>       /* Go through userptr_inval_list and update any invalid 
>> user_pages */
>>       list_for_each_entry(mem, &process_info->userptr_inval_list,
>>                   validate_list.head) {
>> +        struct hmm_range *range;
>> +
>>           invalid = atomic_read(&mem->invalid);
>>           if (!invalid)
>>               /* BO hasn't been invalidated since the last
>> @@ -2329,7 +2332,8 @@ static int update_invalid_user_pages(struct 
>> amdkfd_process_info *process_info,
>>           bo = mem->bo;
>>             /* Get updated user pages */
>> -        ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>> +        ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
>> +                           &range);
>>           if (ret) {
>>               pr_debug("Failed %d to get user pages\n", ret);
>>   @@ -2348,7 +2352,7 @@ static int update_invalid_user_pages(struct 
>> amdkfd_process_info *process_info,
>>                * FIXME: Cannot ignore the return code, must hold
>>                * notifier_lock
>>                */
>> -            amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>> +            amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>>           }
>>             /* Mark the BO as valid unless it was invalidated
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> index 2168163aad2d..252a876b0725 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> @@ -209,6 +209,7 @@ void amdgpu_bo_list_get_list(struct 
>> amdgpu_bo_list *list,
>>               list_add_tail(&e->tv.head, &bucket[priority]);
>>             e->user_pages = NULL;
>> +        e->range = NULL;
>>       }
>>         /* Connect the sorted buckets in the output list. */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> index 9caea1688fc3..e4d78491bcc7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> @@ -26,6 +26,8 @@
>>   #include <drm/ttm/ttm_execbuf_util.h>
>>   #include <drm/amdgpu_drm.h>
>>   +struct hmm_range;
>> +
>>   struct amdgpu_device;
>>   struct amdgpu_bo;
>>   struct amdgpu_bo_va;
>> @@ -36,6 +38,7 @@ struct amdgpu_bo_list_entry {
>>       struct amdgpu_bo_va        *bo_va;
>>       uint32_t            priority;
>>       struct page            **user_pages;
>> +    struct hmm_range        *range;
>>       bool                user_invalidated;
>>   };
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index d371000a5727..7f9cedd8e157 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -910,7 +910,7 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>               goto out_free_user_pages;
>>           }
>>   -        r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages);
>> +        r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages, &e->range);
>>           if (r) {
>>               kvfree(e->user_pages);
>>               e->user_pages = NULL;
>> @@ -988,9 +988,10 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>             if (!e->user_pages)
>>               continue;
>> -        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>> +        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
>>           kvfree(e->user_pages);
>>           e->user_pages = NULL;
>> +        e->range = NULL;
>>       }
>>       mutex_unlock(&p->bo_list->bo_list_mutex);
>>       return r;
>> @@ -1265,7 +1266,8 @@ static int amdgpu_cs_submit(struct 
>> amdgpu_cs_parser *p,
>>       amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
>>           struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>>   -        r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>> +        r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
>> +        e->range = NULL;
>>       }
>>       if (r) {
>>           r = -EAGAIN;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index 111484ceb47d..91571b1324f2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -378,6 +378,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device 
>> *dev, void *data,
>>       struct amdgpu_device *adev = drm_to_adev(dev);
>>       struct drm_amdgpu_gem_userptr *args = data;
>>       struct drm_gem_object *gobj;
>> +    struct hmm_range *range;
>>       struct amdgpu_bo *bo;
>>       uint32_t handle;
>>       int r;
>> @@ -418,7 +419,8 @@ int amdgpu_gem_userptr_ioctl(struct drm_device 
>> *dev, void *data,
>>           goto release_object;
>>         if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE) {
>> -        r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>> +        r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
>> +                         &range);
>>           if (r)
>>               goto release_object;
>>   @@ -441,7 +443,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device 
>> *dev, void *data,
>>     user_pages_done:
>>       if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE)
>> -        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>> +        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>>     release_object:
>>       drm_gem_object_put(gobj);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index 76a8ebfc9e71..a56d28bd23be 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -642,9 +642,6 @@ struct amdgpu_ttm_tt {
>>       struct task_struct    *usertask;
>>       uint32_t        userflags;
>>       bool            bound;
>> -#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>> -    struct hmm_range    *range;
>> -#endif
>>   };
>>     #define ttm_to_amdgpu_ttm_tt(ptr)    container_of(ptr, struct 
>> amdgpu_ttm_tt, ttm)
>> @@ -657,7 +654,8 @@ struct amdgpu_ttm_tt {
>>    * Calling function must call amdgpu_ttm_tt_userptr_range_done() 
>> once and only
>>    * once afterwards to stop HMM tracking
>>    */
>> -int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page 
>> **pages)
>> +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page 
>> **pages,
>> +                 struct hmm_range **range)
>>   {
>>       struct ttm_tt *ttm = bo->tbo.ttm;
>>       struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
>> @@ -673,10 +671,6 @@ int amdgpu_ttm_tt_get_user_pages(struct 
>> amdgpu_bo *bo, struct page **pages)
>>           return -EFAULT;
>>       }
>>   -    /* Another get_user_pages is running at the same time?? */
>> -    if (WARN_ON(gtt->range))
>> -        return -EFAULT;
>> -
>>       if (!mmget_not_zero(mm)) /* Happens during process shutdown */
>>           return -ESRCH;
>>   @@ -694,7 +688,7 @@ int amdgpu_ttm_tt_get_user_pages(struct 
>> amdgpu_bo *bo, struct page **pages)
>>         readonly = amdgpu_ttm_tt_is_readonly(ttm);
>>       r = amdgpu_hmm_range_get_pages(&bo->notifier, mm, pages, start,
>> -                       ttm->num_pages, &gtt->range, readonly,
>> +                       ttm->num_pages, range, readonly,
>>                          true, NULL);
>>   out_unlock:
>>       mmap_read_unlock(mm);
>> @@ -712,30 +706,24 @@ int amdgpu_ttm_tt_get_user_pages(struct 
>> amdgpu_bo *bo, struct page **pages)
>>    *
>>    * Returns: true if pages are still valid
>>    */
>> -bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
>> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
>> +                       struct hmm_range *range)
>>   {
>>       struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
>> -    bool r = false;
>>   -    if (!gtt || !gtt->userptr)
>> +    if (!gtt || !gtt->userptr || !range)
>>           return false;
>>         DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%x\n",
>>           gtt->userptr, ttm->num_pages);
>>   -    WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns,
>> -        "No user pages to check\n");
>> +    WARN_ONCE(!range->hmm_pfns, "No user pages to check\n");
>>   -    if (gtt->range) {
>> -        /*
>> -         * FIXME: Must always hold notifier_lock for this, and must
>> -         * not ignore the return code.
>> -         */
>> -        r = amdgpu_hmm_range_get_pages_done(gtt->range);
>> -        gtt->range = NULL;
>> -    }
>> -
>> -    return !r;
>> +    /*
>> +     * FIXME: Must always hold notifier_lock for this, and must
>> +     * not ignore the return code.
>> +     */
>> +    return !amdgpu_hmm_range_get_pages_done(range);
>>   }
>>   #endif
>>   @@ -812,20 +800,6 @@ static void amdgpu_ttm_tt_unpin_userptr(struct 
>> ttm_device *bdev,
>>       /* unmap the pages mapped to the device */
>>       dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
>>       sg_free_table(ttm->sg);
>> -
>> -#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>> -    if (gtt->range) {
>> -        unsigned long i;
>> -
>> -        for (i = 0; i < ttm->num_pages; i++) {
>> -            if (ttm->pages[i] !=
>> -                hmm_pfn_to_page(gtt->range->hmm_pfns[i]))
>> -                break;
>> -        }
>> -
>> -        WARN((i == ttm->num_pages), "Missing get_user_page_done\n");
>> -    }
>> -#endif
>>   }
>>     static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> index 6a70818039dd..a37207011a69 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> @@ -39,6 +39,8 @@
>>     #define AMDGPU_POISON    0xd0bed0be
>>   +struct hmm_range;
>> +
>>   struct amdgpu_gtt_mgr {
>>       struct ttm_resource_manager manager;
>>       struct drm_mm mm;
>> @@ -149,15 +151,19 @@ void amdgpu_ttm_recover_gart(struct 
>> ttm_buffer_object *tbo);
>>   uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev, 
>> uint32_t type);
>>     #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>> -int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page 
>> **pages);
>> -bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm);
>> +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page 
>> **pages,
>> +                 struct hmm_range **range);
>> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
>> +                       struct hmm_range *range);
>>   #else
>>   static inline int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo,
>> -                           struct page **pages)
>> +                           struct page **pages,
>> +                           struct hmm_range **range)
>>   {
>>       return -EPERM;
>>   }
>> -static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt 
>> *ttm)
>> +static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt 
>> *ttm,
>> +                             struct hmm_range *range)
>>   {
>>       return false;
>>   }



More information about the amd-gfx mailing list