[PATCH] drm/amdgpu: fix error handling in amdgpu_vm_init

Christian König ckoenig.leichtzumerken at gmail.com
Mon Nov 6 12:31:49 UTC 2023


Am 01.11.23 um 23:00 schrieb Felix Kuehling:
> On 2023-10-31 11:18, Alex Deucher wrote:
>> On Tue, Oct 31, 2023 at 11:12 AM Christian König
>> <ckoenig.leichtzumerken at gmail.com> wrote:
>>> When clearing the root PD fails we need to properly release it again.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> Acked-by: Alex Deucher <alexander.deucher at amd.com>
> Has this been submitted? I see some intermittent failures in the PSDB 
> that may be related to this.

Not yet, but I'm going to push it now.

This is just a fix for the error handling path. Fixing the underlying 
problem had higher priority.

Regards,
Christian.

>
> Regards,
>   Felix
>
>
>
>>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 31 
>>> +++++++++++++-------------
>>>   1 file changed, 16 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index d72daf15662f..5877f6e9b893 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -2042,7 +2042,8 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, 
>>> long timeout)
>>>    * Returns:
>>>    * 0 for success, error for failure.
>>>    */
>>> -int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm 
>>> *vm, int32_t xcp_id)
>>> +int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>> +                  int32_t xcp_id)
>>>   {
>>>          struct amdgpu_bo *root_bo;
>>>          struct amdgpu_bo_vm *root;
>>> @@ -2061,6 +2062,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, 
>>> struct amdgpu_vm *vm, int32_t xcp
>>>          INIT_LIST_HEAD(&vm->done);
>>>          INIT_LIST_HEAD(&vm->pt_freed);
>>>          INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
>>> +       INIT_KFIFO(vm->faults);
>>>
>>>          /* create scheduler entities for page table updates */
>>>          r = drm_sched_entity_init(&vm->immediate, 
>>> DRM_SCHED_PRIORITY_NORMAL,
>>> @@ -2103,34 +2105,33 @@ int amdgpu_vm_init(struct amdgpu_device 
>>> *adev, struct amdgpu_vm *vm, int32_t xcp
>>>                                  false, &root, xcp_id);
>>>          if (r)
>>>                  goto error_free_delayed;
>>> -       root_bo = &root->bo;
>>> +
>>> +       root_bo = amdgpu_bo_ref(&root->bo);
>>>          r = amdgpu_bo_reserve(root_bo, true);
>>> -       if (r)
>>> -               goto error_free_root;
>>> +       if (r) {
>>> +               amdgpu_bo_unref(&root->shadow);
>>> +               amdgpu_bo_unref(&root_bo);
>>> +               goto error_free_delayed;
>>> +       }
>>>
>>> +       amdgpu_vm_bo_base_init(&vm->root, vm, root_bo);
>>>          r = dma_resv_reserve_fences(root_bo->tbo.base.resv, 1);
>>>          if (r)
>>> -               goto error_unreserve;
>>> -
>>> -       amdgpu_vm_bo_base_init(&vm->root, vm, root_bo);
>>> +               goto error_free_root;
>>>
>>>          r = amdgpu_vm_pt_clear(adev, vm, root, false);
>>>          if (r)
>>> -               goto error_unreserve;
>>> +               goto error_free_root;
>>>
>>>          amdgpu_bo_unreserve(vm->root.bo);
>>> -
>>> -       INIT_KFIFO(vm->faults);
>>> +       amdgpu_bo_unref(&root_bo);
>>>
>>>          return 0;
>>>
>>> -error_unreserve:
>>> -       amdgpu_bo_unreserve(vm->root.bo);
>>> -
>>>   error_free_root:
>>> -       amdgpu_bo_unref(&root->shadow);
>>> +       amdgpu_vm_pt_free_root(adev, vm);
>>> +       amdgpu_bo_unreserve(vm->root.bo);
>>>          amdgpu_bo_unref(&root_bo);
>>> -       vm->root.bo = NULL;
>>>
>>>   error_free_delayed:
>>>          dma_fence_put(vm->last_tlb_flush);
>>> -- 
>>> 2.34.1
>>>



More information about the amd-gfx mailing list