[PATCH v11 24/28] drm/amdgpu: resume gfx userqueues

Tue Sep 17 12:30:59 UTC 2024

Am 09.09.24 um 22:06 schrieb Shashank Sharma:
> This patch adds support for userqueue resume. What it typically does is
> this:
> - adds a new delayed work for resuming all the queues.
> - schedules this delayed work from the suspend work.
> - validates the BOs and replaces the eviction fence before resuming all
>    the queues running under this instance of userq manager.
>
> V2: Addressed Christian's review comments:
>      - declare local variables like ret at the bottom.
>      - lock all the object first, then start attaching the new fence.
>      - dont replace old eviction fence, just attach new eviction fence.
>      - no error logs for drm_exec_lock failures
>      - no need to reserve bos after drm_exec_locked
>      - schedule the resume worker immediately (not after 100 ms)
>      - check for NULL BO (Arvind)
>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Cc: Christian Koenig <christian.koenig at amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
> Signed-off-by: Arvind Yadav <arvind.yadav at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 120 ++++++++++++++++++
>   .../gpu/drm/amd/include/amdgpu_userqueue.h    |   1 +
>   2 files changed, 121 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> index 979174f80993..e7f7354e0c0e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> @@ -405,6 +405,122 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
>   	return r;
>   }
>   
> +/**
> + * amdgpu_userqueue_resume_all - resume every queue of a userqueue manager
> + * @uq_mgr: userqueue manager whose queues are resumed
> + *
> + * Walks @uq_mgr->userq_idr and calls the GFX IP resume callback on each
> + * queue. A queue that fails to resume is logged and the walk continues
> + * with the remaining queues.
> + *
> + * Return: 0 if every queue resumed or there were no queues at all,
> + * otherwise the error code of the last queue that failed.
> + */
> +static int
> +amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
> +{
> +	struct amdgpu_device *adev = uq_mgr->adev;
> +	const struct amdgpu_userq_funcs *userq_funcs;
> +	struct amdgpu_usermode_queue *queue;
> +	int queue_id, r, ret = 0;
> +
> +	userq_funcs = adev->userq_funcs[AMDGPU_HW_IP_GFX];
> +
> +	/* Resume all the queues for this process */
> +	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> +		r = userq_funcs->resume(uq_mgr, queue);
> +		if (r) {
> +			DRM_ERROR("Failed to resume queue %d\n", queue_id);
> +			/* Keep the failure so a later success cannot hide it */
> +			ret = r;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +amdgpu_userqueue_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
> +{
> +	struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> +	struct amdgpu_vm *vm = &fpriv->vm;
> +	struct amdgpu_bo_va *bo_va, *tmp;
> +	struct drm_exec exec;
> +	struct amdgpu_bo *bo;
> +	int ret;
> +
> +	drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
> +	drm_exec_until_all_locked(&exec) {
> +		ret = amdgpu_vm_lock_pd(vm, &exec, 2);
> +		drm_exec_retry_on_contention(&exec);
> +		if (unlikely(ret)) {
> +			DRM_ERROR("Failed to lock PD\n");

I would drop those error messages in the low-level functions.

Apart from contention, the most likely reason why locking a BO fails is 
that we were interrupted, and in that case we actually don't want to 
print anything.

Apart from that I really need to wrap my head around the VM code once 
more, but what is here should probably work for now.

Regards,
Christian.

> +			goto unlock_all;
> +		}
> +
> +		/* Lock the done list */
> +		list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status) {
> +			bo = bo_va->base.bo;
> +			if (!bo)
> +				continue;
> +
> +			ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
> +			drm_exec_retry_on_contention(&exec);
> +			if (unlikely(ret))
> +				goto unlock_all;
> +		}
> +
> +		/* Lock the invalidated list */
> +		list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) {
> +			bo = bo_va->base.bo;
> +			if (!bo)
> +				continue;
> +
> +			ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
> +			drm_exec_retry_on_contention(&exec);
> +			if (unlikely(ret))
> +				goto unlock_all;
> +		}
> +	}
> +
> +	/* Now validate BOs */
> +	list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) {
> +		bo = bo_va->base.bo;
> +		if (!bo)
> +			continue;
> +
> +		ret = amdgpu_userqueue_validate_vm_bo(NULL, bo);
> +		if (ret) {
> +			DRM_ERROR("Failed to validate BO\n");
> +			goto unlock_all;
> +		}
> +	}
> +
> +	/* Handle the moved BOs */
> +	ret = amdgpu_vm_handle_moved(uq_mgr->adev, vm, &exec.ticket);
> +	if (ret) {
> +		DRM_ERROR("Failed to handle moved BOs\n");
> +		goto unlock_all;
> +	}
> +
> +	ret = amdgpu_eviction_fence_replace_fence(fpriv);
> +	if (ret)
> +		DRM_ERROR("Failed to replace eviction fence\n");
> +
> +unlock_all:
> +	drm_exec_fini(&exec);
> +	return ret;
> +}
> +
> +/*
> + * Delayed-work handler for resume_work: validates the BOs and, only if that
> + * succeeded, resumes all queues. userq_mutex is held for the whole sequence.
> + */
> +static void amdgpu_userqueue_resume_worker(struct work_struct *work)
> +{
> +	struct amdgpu_userq_mgr *mgr = work_to_uq_mgr(work, resume_work.work);
> +	int err;
> +
> +	mutex_lock(&mgr->userq_mutex);
> +
> +	err = amdgpu_userqueue_validate_bos(mgr);
> +	if (err) {
> +		DRM_ERROR("Failed to validate BOs to restore\n");
> +	} else {
> +		/* Queues are resumed only after validation succeeded */
> +		err = amdgpu_userqueue_resume_all(mgr);
> +		if (err)
> +			DRM_ERROR("Failed to resume all queues\n");
> +	}
> +
> +	mutex_unlock(&mgr->userq_mutex);
> +}
> +
>   static int
>   amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
>   {
> @@ -486,6 +602,9 @@ amdgpu_userqueue_suspend_worker(struct work_struct *work)
>   	/* Cleanup old eviction fence entry */
>   	amdgpu_eviction_fence_destroy(evf_mgr);
>   
> +	/* Schedule a work to restore userqueue */
> +	schedule_delayed_work(&uq_mgr->resume_work, 0);
> +
>   unlock:
>   	mutex_unlock(&uq_mgr->userq_mutex);
>   }
> @@ -508,6 +627,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_devi
>   	/* This reference is required for suspend work */
>   	fpriv->evf_mgr.ev_fence->uq_mgr = userq_mgr;
>   	INIT_DELAYED_WORK(&userq_mgr->suspend_work, amdgpu_userqueue_suspend_worker);
> +	INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userqueue_resume_worker);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
> index 8b3b50fa8b5b..d035b5c2b14b 100644
> --- a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
> +++ b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
> @@ -76,6 +76,7 @@ struct amdgpu_userq_mgr {
>   	struct amdgpu_device		*adev;
>   
>   	struct delayed_work		suspend_work;
> +	struct delayed_work		resume_work;
>   	int num_userqs;
>   };
>