[PATCH 2/4] drm/amdgpu: fix userptr HMM range handling
Felix Kuehling
felix.kuehling at amd.com
Tue Nov 15 15:36:43 UTC 2022
Am 2022-11-15 um 08:37 schrieb Christian König:
> Am 10.11.22 um 22:55 schrieb Felix Kuehling:
>>
>> Am 2022-11-10 um 08:00 schrieb Christian König:
>>> The basic problem here is that it's not allowed to page fault while
>>> holding the reservation lock.
>>>
>>> So it can happen that multiple processes try to validate a userptr
>>> at the same time.
>>>
>>> Work around that by putting the HMM range object into the mutex
>>> protected bo list for now.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> CC: stable at vger.kernel.org
>>> ---
>>> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +++--
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c | 1 +
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h | 3 ++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 8 +--
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 6 ++-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 50
>>> +++++--------------
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 14 ++++--
>>> 7 files changed, 43 insertions(+), 51 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index c5c9bfa2772e..83659e6419a8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -940,6 +940,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr,
>>> struct amdkfd_process_info *process_info = mem->process_info;
>>> struct amdgpu_bo *bo = mem->bo;
>>> struct ttm_operation_ctx ctx = { true, false };
>>> + struct hmm_range *range;
>>
>> I'd feel better if these local hmm_range pointers here and in
>> update_invalid_user_pages and amdgpu_gem_userptr_ioctl were
>> initialized to NULL. amdgpu_ttm_tt_get_user_pages leaves it
>> uninitialized in case of errors and amdgpu_ttm_tt_get_user_pages_done
>> checks for !range.
>
> I've opted to initializing range to NULL in case of an error.
>
>>
>> With that fixed, the patch is
>>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>
> Does that still count?
If you don't mind, please send the updated patch again. It's always good
to have a second pair of eyes on error handling.
Thanks,
Felix
>
> Thanks,
> Christian.
>
>>
>>
>>> int ret = 0;
>>> mutex_lock(&process_info->lock);
>>> @@ -969,7 +970,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr,
>>> return 0;
>>> }
>>> - ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> + ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
>>> &range);
>>> if (ret) {
>>> pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>> goto unregister_out;
>>> @@ -987,7 +988,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr,
>>> amdgpu_bo_unreserve(bo);
>>> release_out:
>>> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>>> unregister_out:
>>> if (ret)
>>> amdgpu_mn_unregister(bo);
>>> @@ -2319,6 +2320,8 @@ static int update_invalid_user_pages(struct
>>> amdkfd_process_info *process_info,
>>> /* Go through userptr_inval_list and update any invalid
>>> user_pages */
>>> list_for_each_entry(mem, &process_info->userptr_inval_list,
>>> validate_list.head) {
>>> + struct hmm_range *range;
>>> +
>>> invalid = atomic_read(&mem->invalid);
>>> if (!invalid)
>>> /* BO hasn't been invalidated since the last
>>> @@ -2329,7 +2332,8 @@ static int update_invalid_user_pages(struct
>>> amdkfd_process_info *process_info,
>>> bo = mem->bo;
>>> /* Get updated user pages */
>>> - ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> + ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
>>> + &range);
>>> if (ret) {
>>> pr_debug("Failed %d to get user pages\n", ret);
>>> @@ -2348,7 +2352,7 @@ static int update_invalid_user_pages(struct
>>> amdkfd_process_info *process_info,
>>> * FIXME: Cannot ignore the return code, must hold
>>> * notifier_lock
>>> */
>>> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>>> }
>>> /* Mark the BO as valid unless it was invalidated
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> index 2168163aad2d..252a876b0725 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> @@ -209,6 +209,7 @@ void amdgpu_bo_list_get_list(struct
>>> amdgpu_bo_list *list,
>>> list_add_tail(&e->tv.head, &bucket[priority]);
>>> e->user_pages = NULL;
>>> + e->range = NULL;
>>> }
>>> /* Connect the sorted buckets in the output list. */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> index 9caea1688fc3..e4d78491bcc7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> @@ -26,6 +26,8 @@
>>> #include <drm/ttm/ttm_execbuf_util.h>
>>> #include <drm/amdgpu_drm.h>
>>> +struct hmm_range;
>>> +
>>> struct amdgpu_device;
>>> struct amdgpu_bo;
>>> struct amdgpu_bo_va;
>>> @@ -36,6 +38,7 @@ struct amdgpu_bo_list_entry {
>>> struct amdgpu_bo_va *bo_va;
>>> uint32_t priority;
>>> struct page **user_pages;
>>> + struct hmm_range *range;
>>> bool user_invalidated;
>>> };
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index d371000a5727..7f9cedd8e157 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -910,7 +910,7 @@ static int amdgpu_cs_parser_bos(struct
>>> amdgpu_cs_parser *p,
>>> goto out_free_user_pages;
>>> }
>>> - r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages);
>>> + r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages,
>>> &e->range);
>>> if (r) {
>>> kvfree(e->user_pages);
>>> e->user_pages = NULL;
>>> @@ -988,9 +988,10 @@ static int amdgpu_cs_parser_bos(struct
>>> amdgpu_cs_parser *p,
>>> if (!e->user_pages)
>>> continue;
>>> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
>>> kvfree(e->user_pages);
>>> e->user_pages = NULL;
>>> + e->range = NULL;
>>> }
>>> mutex_unlock(&p->bo_list->bo_list_mutex);
>>> return r;
>>> @@ -1265,7 +1266,8 @@ static int amdgpu_cs_submit(struct
>>> amdgpu_cs_parser *p,
>>> amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
>>> struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>>> - r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>> + r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm,
>>> e->range);
>>> + e->range = NULL;
>>> }
>>> if (r) {
>>> r = -EAGAIN;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> index 111484ceb47d..91571b1324f2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> @@ -378,6 +378,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device
>>> *dev, void *data,
>>> struct amdgpu_device *adev = drm_to_adev(dev);
>>> struct drm_amdgpu_gem_userptr *args = data;
>>> struct drm_gem_object *gobj;
>>> + struct hmm_range *range;
>>> struct amdgpu_bo *bo;
>>> uint32_t handle;
>>> int r;
>>> @@ -418,7 +419,8 @@ int amdgpu_gem_userptr_ioctl(struct drm_device
>>> *dev, void *data,
>>> goto release_object;
>>> if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE) {
>>> - r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> + r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
>>> + &range);
>>> if (r)
>>> goto release_object;
>>> @@ -441,7 +443,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device
>>> *dev, void *data,
>>> user_pages_done:
>>> if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE)
>>> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
>>> release_object:
>>> drm_gem_object_put(gobj);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> index 76a8ebfc9e71..a56d28bd23be 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> @@ -642,9 +642,6 @@ struct amdgpu_ttm_tt {
>>> struct task_struct *usertask;
>>> uint32_t userflags;
>>> bool bound;
>>> -#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>>> - struct hmm_range *range;
>>> -#endif
>>> };
>>> #define ttm_to_amdgpu_ttm_tt(ptr) container_of(ptr, struct
>>> amdgpu_ttm_tt, ttm)
>>> @@ -657,7 +654,8 @@ struct amdgpu_ttm_tt {
>>> * Calling function must call amdgpu_ttm_tt_userptr_range_done()
>>> once and only
>>> * once afterwards to stop HMM tracking
>>> */
>>> -int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page
>>> **pages)
>>> +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page
>>> **pages,
>>> + struct hmm_range **range)
>>> {
>>> struct ttm_tt *ttm = bo->tbo.ttm;
>>> struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
>>> @@ -673,10 +671,6 @@ int amdgpu_ttm_tt_get_user_pages(struct
>>> amdgpu_bo *bo, struct page **pages)
>>> return -EFAULT;
>>> }
>>> - /* Another get_user_pages is running at the same time?? */
>>> - if (WARN_ON(gtt->range))
>>> - return -EFAULT;
>>> -
>>> if (!mmget_not_zero(mm)) /* Happens during process shutdown */
>>> return -ESRCH;
>>> @@ -694,7 +688,7 @@ int amdgpu_ttm_tt_get_user_pages(struct
>>> amdgpu_bo *bo, struct page **pages)
>>> readonly = amdgpu_ttm_tt_is_readonly(ttm);
>>> r = amdgpu_hmm_range_get_pages(&bo->notifier, mm, pages, start,
>>> - ttm->num_pages, &gtt->range, readonly,
>>> + ttm->num_pages, range, readonly,
>>> true, NULL);
>>> out_unlock:
>>> mmap_read_unlock(mm);
>>> @@ -712,30 +706,24 @@ int amdgpu_ttm_tt_get_user_pages(struct
>>> amdgpu_bo *bo, struct page **pages)
>>> *
>>> * Returns: true if pages are still valid
>>> */
>>> -bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
>>> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
>>> + struct hmm_range *range)
>>> {
>>> struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
>>> - bool r = false;
>>> - if (!gtt || !gtt->userptr)
>>> + if (!gtt || !gtt->userptr || !range)
>>> return false;
>>> DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%x\n",
>>> gtt->userptr, ttm->num_pages);
>>> - WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns,
>>> - "No user pages to check\n");
>>> + WARN_ONCE(!range->hmm_pfns, "No user pages to check\n");
>>> - if (gtt->range) {
>>> - /*
>>> - * FIXME: Must always hold notifier_lock for this, and must
>>> - * not ignore the return code.
>>> - */
>>> - r = amdgpu_hmm_range_get_pages_done(gtt->range);
>>> - gtt->range = NULL;
>>> - }
>>> -
>>> - return !r;
>>> + /*
>>> + * FIXME: Must always hold notifier_lock for this, and must
>>> + * not ignore the return code.
>>> + */
>>> + return !amdgpu_hmm_range_get_pages_done(range);
>>> }
>>> #endif
>>> @@ -812,20 +800,6 @@ static void
>>> amdgpu_ttm_tt_unpin_userptr(struct ttm_device *bdev,
>>> /* unmap the pages mapped to the device */
>>> dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
>>> sg_free_table(ttm->sg);
>>> -
>>> -#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>>> - if (gtt->range) {
>>> - unsigned long i;
>>> -
>>> - for (i = 0; i < ttm->num_pages; i++) {
>>> - if (ttm->pages[i] !=
>>> - hmm_pfn_to_page(gtt->range->hmm_pfns[i]))
>>> - break;
>>> - }
>>> -
>>> - WARN((i == ttm->num_pages), "Missing get_user_page_done\n");
>>> - }
>>> -#endif
>>> }
>>> static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> index 6a70818039dd..a37207011a69 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> @@ -39,6 +39,8 @@
>>> #define AMDGPU_POISON 0xd0bed0be
>>> +struct hmm_range;
>>> +
>>> struct amdgpu_gtt_mgr {
>>> struct ttm_resource_manager manager;
>>> struct drm_mm mm;
>>> @@ -149,15 +151,19 @@ void amdgpu_ttm_recover_gart(struct
>>> ttm_buffer_object *tbo);
>>> uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev,
>>> uint32_t type);
>>> #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>>> -int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page
>>> **pages);
>>> -bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm);
>>> +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page
>>> **pages,
>>> + struct hmm_range **range);
>>> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
>>> + struct hmm_range *range);
>>> #else
>>> static inline int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo,
>>> - struct page **pages)
>>> + struct page **pages,
>>> + struct hmm_range **range)
>>> {
>>> return -EPERM;
>>> }
>>> -static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt
>>> *ttm)
>>> +static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt
>>> *ttm,
>>> + struct hmm_range *range)
>>> {
>>> return false;
>>> }
>
More information about the amd-gfx
mailing list