[PATCH v11 24/28] drm/amdgpu: resume gfx userqueues
Sharma, Shashank
shashank.sharma at amd.com
Wed Sep 25 09:15:50 UTC 2024
On 17/09/2024 14:30, Christian König wrote:
> Am 09.09.24 um 22:06 schrieb Shashank Sharma:
>> This patch adds support for userqueue resume. What it typically does is
>> this:
>> - adds a new delayed work for resuming all the queues.
>> - schedules this delayed work from the suspend work.
>> - validates the BOs and replaces the eviction fence before resuming all
>> the queues running under this instance of userq manager.
>>
>> V2: Addressed Christian's review comments:
>> - declare local variables like ret at the bottom.
>> - lock all the objects first, then start attaching the new fence.
>> - don't replace old eviction fence, just attach new eviction fence.
>> - no error logs for drm_exec_lock failures
>> - no need to reserve bos after drm_exec_locked
>> - schedule the resume worker immediately (not after 100 ms)
>> - check for NULL BO (Arvind)
>>
>> Cc: Alex Deucher <alexander.deucher at amd.com>
>> Cc: Christian Koenig <christian.koenig at amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>> Signed-off-by: Arvind Yadav <arvind.yadav at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 120 ++++++++++++++++++
>> .../gpu/drm/amd/include/amdgpu_userqueue.h | 1 +
>> 2 files changed, 121 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>> index 979174f80993..e7f7354e0c0e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>> @@ -405,6 +405,122 @@ int amdgpu_userq_ioctl(struct drm_device *dev,
>> void *data,
>> return r;
>> }
>> +static int
>> +amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
>> +{
>> + struct amdgpu_device *adev = uq_mgr->adev;
>> + const struct amdgpu_userq_funcs *userq_funcs;
>> + struct amdgpu_usermode_queue *queue;
>> + int queue_id, ret;
>> +
>> + userq_funcs = adev->userq_funcs[AMDGPU_HW_IP_GFX];
>> +
>> + /* Resume all the queues for this process */
>> + idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
>> + ret = userq_funcs->resume(uq_mgr, queue);
>> + if (ret)
>> + DRM_ERROR("Failed to resume queue %d\n", queue_id);
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +static int
>> +amdgpu_userqueue_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
>> +{
>> + struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>> + struct amdgpu_vm *vm = &fpriv->vm;
>> + struct amdgpu_bo_va *bo_va, *tmp;
>> + struct drm_exec exec;
>> + struct amdgpu_bo *bo;
>> + int ret;
>> +
>> + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
>> + drm_exec_until_all_locked(&exec) {
>> + ret = amdgpu_vm_lock_pd(vm, &exec, 2);
>> + drm_exec_retry_on_contention(&exec);
>> + if (unlikely(ret)) {
>> + DRM_ERROR("Failed to lock PD\n");
>
> I would drop those error messages in the low level function.
>
> The most likely cause (except for contention) why locking a BO fails
> is because we were interrupted, and for that we actually don't want to
> print anything.
>
> Apart from that I really need to wrap my head around the VM code once
> more, but this should probably work for now.
Noted, I will remove the error message.
- Shashank
>
> Regards,
> Christian.
>
>> + goto unlock_all;
>> + }
>> +
>> + /* Lock the done list */
>> + list_for_each_entry_safe(bo_va, tmp, &vm->done,
>> base.vm_status) {
>> + bo = bo_va->base.bo;
>> + if (!bo)
>> + continue;
>> +
>> + ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
>> + drm_exec_retry_on_contention(&exec);
>> + if (unlikely(ret))
>> + goto unlock_all;
>> + }
>> +
>> + /* Lock the invalidated list */
>> + list_for_each_entry_safe(bo_va, tmp, &vm->invalidated,
>> base.vm_status) {
>> + bo = bo_va->base.bo;
>> + if (!bo)
>> + continue;
>> +
>> + ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
>> + drm_exec_retry_on_contention(&exec);
>> + if (unlikely(ret))
>> + goto unlock_all;
>> + }
>> + }
>> +
>> + /* Now validate BOs */
>> + list_for_each_entry_safe(bo_va, tmp, &vm->invalidated,
>> base.vm_status) {
>> + bo = bo_va->base.bo;
>> + if (!bo)
>> + continue;
>> +
>> + ret = amdgpu_userqueue_validate_vm_bo(NULL, bo);
>> + if (ret) {
>> + DRM_ERROR("Failed to validate BO\n");
>> + goto unlock_all;
>> + }
>> + }
>> +
>> + /* Handle the moved BOs */
>> + ret = amdgpu_vm_handle_moved(uq_mgr->adev, vm, &exec.ticket);
>> + if (ret) {
>> + DRM_ERROR("Failed to handle moved BOs\n");
>> + goto unlock_all;
>> + }
>> +
>> + ret = amdgpu_eviction_fence_replace_fence(fpriv);
>> + if (ret)
>> + DRM_ERROR("Failed to replace eviction fence\n");
>> +
>> +unlock_all:
>> + drm_exec_fini(&exec);
>> + return ret;
>> +}
>> +
>> +static void amdgpu_userqueue_resume_worker(struct work_struct *work)
>> +{
>> + struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work,
>> resume_work.work);
>> + int ret;
>> +
>> + mutex_lock(&uq_mgr->userq_mutex);
>> +
>> + ret = amdgpu_userqueue_validate_bos(uq_mgr);
>> + if (ret) {
>> + DRM_ERROR("Failed to validate BOs to restore\n");
>> + goto unlock;
>> + }
>> +
>> + ret = amdgpu_userqueue_resume_all(uq_mgr);
>> + if (ret) {
>> + DRM_ERROR("Failed to resume all queues\n");
>> + goto unlock;
>> + }
>> +
>> +unlock:
>> + mutex_unlock(&uq_mgr->userq_mutex);
>> +}
>> +
>> static int
>> amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
>> {
>> @@ -486,6 +602,9 @@ amdgpu_userqueue_suspend_worker(struct
>> work_struct *work)
>> /* Cleanup old eviction fence entry */
>> amdgpu_eviction_fence_destroy(evf_mgr);
>> + /* Schedule a work to restore userqueue */
>> + schedule_delayed_work(&uq_mgr->resume_work, 0);
>> +
>> unlock:
>> mutex_unlock(&uq_mgr->userq_mutex);
>> }
>> @@ -508,6 +627,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr
>> *userq_mgr, struct amdgpu_devi
>> /* This reference is required for suspend work */
>> fpriv->evf_mgr.ev_fence->uq_mgr = userq_mgr;
>> INIT_DELAYED_WORK(&userq_mgr->suspend_work,
>> amdgpu_userqueue_suspend_worker);
>> + INIT_DELAYED_WORK(&userq_mgr->resume_work,
>> amdgpu_userqueue_resume_worker);
>> return 0;
>> }
>> diff --git a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
>> b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
>> index 8b3b50fa8b5b..d035b5c2b14b 100644
>> --- a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
>> +++ b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
>> @@ -76,6 +76,7 @@ struct amdgpu_userq_mgr {
>> struct amdgpu_device *adev;
>> struct delayed_work suspend_work;
>> + struct delayed_work resume_work;
>> int num_userqs;
>> };
>
More information about the amd-gfx
mailing list