[PATCH 1/3] drm/amdgpu: fix userq VM validation v2
Alex Deucher
alexdeucher at gmail.com
Thu Aug 28 14:29:44 UTC 2025
On Thu, Aug 28, 2025 at 5:41 AM Christian König
<ckoenig.leichtzumerken at gmail.com> wrote:
>
> That was actually complete nonsense and not validating the BOs
> at all. The code just cleared all VM areas were it couldn't grab the
> lock for a BO.
>
> Try to fix this. Only compile tested at the moment.
>
> v2: fix fence slot reservation as well as pointed out by Sunil.
> also validate PDs, PTs, per VM BOs and update PDEs
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 118 ++++++++++------------
> 1 file changed, 56 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 424831997cb1..11edad1951c7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -545,108 +545,102 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
> return ret;
> }
>
> +static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo)
I would rename this amdgpu_userq_bo_validate() for consistency (e.g.,
amdgpu_cs_bo_validate,()) and to make it clearer what it is doing.
> +{
> + struct ttm_operation_ctx ctx = { false, false };
> +
> + amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> + return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> +}
> +
> static int
> -amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
> +amdgpu_userq_validate_bos(struct amdgpu_device *adev, struct drm_exec *exec,
> + struct amdgpu_vm *vm)
> {
> struct ttm_operation_ctx ctx = { false, false };
> + struct amdgpu_bo_va *bo_va;
> + struct amdgpu_bo *bo;
> int ret;
>
> - amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> + spin_lock(&vm->status_lock);
> + while (!list_empty(&vm->invalidated)) {
> + bo_va = list_first_entry(&vm->invalidated,
> + struct amdgpu_bo_va,
> + base.vm_status);
> + spin_unlock(&vm->status_lock);
>
> - ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> - if (ret)
> - DRM_ERROR("Fail to validate\n");
> + bo = bo_va->base.bo;
> + ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
> + if (unlikely(ret))
> + return ret;
>
> - return ret;
> + amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> + if (ret)
> + return ret;
> +
> + /* This moves the bo_va to the done list */
> + ret = amdgpu_vm_bo_update(adev, bo_va, false);
> + if (ret)
> + return ret;
> +
> + spin_lock(&vm->status_lock);
> + }
> + spin_unlock(&vm->status_lock);
> +
> + return 0;
> }
>
> static int
> -amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
> +amdgpu_userq_update_vm(struct amdgpu_userq_mgr *uq_mgr)
Please add some comments to this function to make it clear what each
step is doing and why we are doing it. Many of these functions have
very similar names and it's hard to follow what they are all doing
without double checking their implementations. It would also be
helpful to plug in a comment about where we should plug in usr ptr
checks. amdgpu_cs_parser_bos() could use similar comments if you have
a chance.
Other than that, the patch looks good to me.
Alex
> {
> struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> - struct amdgpu_vm *vm = &fpriv->vm;
> struct amdgpu_device *adev = uq_mgr->adev;
> + struct amdgpu_vm *vm = &fpriv->vm;
> struct amdgpu_bo_va *bo_va;
> - struct ww_acquire_ctx *ticket;
> struct drm_exec exec;
> struct amdgpu_bo *bo;
> - struct dma_resv *resv;
> - bool clear, unlock;
> int ret = 0;
>
> drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
> drm_exec_until_all_locked(&exec) {
> - ret = amdgpu_vm_lock_pd(vm, &exec, 2);
> + ret = amdgpu_vm_lock_pd(vm, &exec, 1);
> drm_exec_retry_on_contention(&exec);
> if (unlikely(ret)) {
> drm_file_err(uq_mgr->file, "Failed to lock PD\n");
> goto unlock_all;
> }
>
> - /* Lock the done list */
> list_for_each_entry(bo_va, &vm->done, base.vm_status) {
> bo = bo_va->base.bo;
> if (!bo)
> continue;
>
> - ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
> + ret = drm_exec_prepare_obj(&exec, &bo->tbo.base, 1);
> drm_exec_retry_on_contention(&exec);
> if (unlikely(ret))
> goto unlock_all;
> }
> - }
>
> - spin_lock(&vm->status_lock);
> - while (!list_empty(&vm->moved)) {
> - bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
> - base.vm_status);
> - spin_unlock(&vm->status_lock);
> -
> - /* Per VM BOs never need to bo cleared in the page tables */
> - ret = amdgpu_vm_bo_update(adev, bo_va, false);
> - if (ret)
> + ret = amdgpu_vm_validate(adev, vm, NULL,
> + amdgpu_userq_validate_vm,
> + NULL);
> + if (unlikely(ret))
> goto unlock_all;
> - spin_lock(&vm->status_lock);
> - }
>
> - ticket = &exec.ticket;
> - while (!list_empty(&vm->invalidated)) {
> - bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
> - base.vm_status);
> - resv = bo_va->base.bo->tbo.base.resv;
> - spin_unlock(&vm->status_lock);
> -
> - bo = bo_va->base.bo;
> - ret = amdgpu_userq_validate_vm_bo(NULL, bo);
> - if (ret) {
> - drm_file_err(uq_mgr->file, "Failed to validate BO\n");
> + ret = amdgpu_userq_validate_bos(adev, &exec, vm);
> + drm_exec_retry_on_contention(&exec);
> + if (unlikely(ret))
> goto unlock_all;
> - }
> -
> - /* Try to reserve the BO to avoid clearing its ptes */
> - if (!adev->debug_vm && dma_resv_trylock(resv)) {
> - clear = false;
> - unlock = true;
> - /* The caller is already holding the reservation lock */
> - } else if (dma_resv_locking_ctx(resv) == ticket) {
> - clear = false;
> - unlock = false;
> - /* Somebody else is using the BO right now */
> - } else {
> - clear = true;
> - unlock = false;
> - }
> -
> - ret = amdgpu_vm_bo_update(adev, bo_va, clear);
> + }
>
> - if (unlock)
> - dma_resv_unlock(resv);
> - if (ret)
> - goto unlock_all;
> + ret = amdgpu_vm_handle_moved(adev, vm, NULL);
> + if (ret)
> + goto unlock_all;
>
> - spin_lock(&vm->status_lock);
> - }
> - spin_unlock(&vm->status_lock);
> + ret = amdgpu_vm_update_pdes(adev, vm, false);
> + if (ret)
> + goto unlock_all;
>
> ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
> if (ret)
> @@ -667,7 +661,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
>
> mutex_lock(&uq_mgr->userq_mutex);
>
> - ret = amdgpu_userq_validate_bos(uq_mgr);
> + ret = amdgpu_userq_update_vm(uq_mgr);
> if (ret) {
> drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n");
> goto unlock;
> --
> 2.43.0
>
More information about the amd-gfx
mailing list