[RFC PATCH 1/1] drm/amdgpu: Protect the amdgpu_bo_list list with a mutex

Christian König christian.koenig at amd.com
Thu Jul 14 07:58:19 UTC 2022


Hi guys,

I need this fixed for the gang submit branch. So I'm going to pick up 
this patch and make the necessary modifications to make it stable.

Thanks,
Christian.

Am 12.07.22 um 10:07 schrieb Christian König:
> Am 12.07.22 um 07:39 schrieb Luben Tuikov:
>> Protect the struct amdgpu_bo_list with a mutex. This is used during 
>> command
>> submission in order to avoid buffer object corruption as recorded in
>> the link below.
>>
>> Suggested-by: Christian König <christian.koenig at amd.com>
>> Cc: Alex Deucher <Alexander.Deucher at amd.com>
>> Cc: Andrey Grodzovsky <Andrey.Grodzovsky at amd.com>
>> Cc: Vitaly Prosyak <Vitaly.Prosyak at amd.com>
>> Link: 
>> https://gitlab.freedesktop.org/drm/amd/-/issues/2048
>> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  3 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |  4 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      | 31 +++++++++++++++++++--
>>   3 files changed, 35 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> index 714178f1b6c6ed..2168163aad2d38 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> @@ -40,7 +40,7 @@ static void amdgpu_bo_list_free_rcu(struct rcu_head 
>> *rcu)
>>   {
>>       struct amdgpu_bo_list *list = container_of(rcu, struct 
>> amdgpu_bo_list,
>>                              rhead);
>> -
>> +    mutex_destroy(&list->bo_list_mutex);
>>       kvfree(list);
>>   }
>>   @@ -136,6 +136,7 @@ int amdgpu_bo_list_create(struct amdgpu_device 
>> *adev, struct drm_file *filp,
>>         trace_amdgpu_cs_bo_status(list->num_entries, total_size);
>>   +    mutex_init(&list->bo_list_mutex);
>>       *result = list;
>>       return 0;
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> index 044b41f0bfd9ce..717984d4de6858 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> @@ -48,6 +48,10 @@ struct amdgpu_bo_list {
>>       struct amdgpu_bo *oa_obj;
>>       unsigned first_userptr;
>>       unsigned num_entries;
>> +
>> +    /* Protect access during command submission.
>> +     */
>> +    struct mutex bo_list_mutex;
>>   };
>>     int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 36ac1f1d11e6b4..0b2932c20ec777 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -517,6 +517,8 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>               return r;
>>       }
>>   +    mutex_lock(&p->bo_list->bo_list_mutex);
>
> That lock/unlock placement is not correct and probably the reason why 
> you still run into trouble with this patch.
>
> You need to grab the lock before the call to amdgpu_bo_list_get_list() 
> and drop it either after a call to ttm_eu_backoff_reservation() or 
> ttm_eu_fence_buffer_objects().
>
> If the lock is dropped anywhere in between we would have list 
> corruption again.
>
> Regards,
> Christian.
>
>> +
>>       /* One for TTM and one for the CS job */
>>       amdgpu_bo_list_for_each_entry(e, p->bo_list)
>>           e->tv.num_shared = 2;
>> @@ -544,6 +546,7 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>           if (!e->user_pages) {
>>               DRM_ERROR("kvmalloc_array failure\n");
>>               r = -ENOMEM;
>> +            mutex_unlock(&p->bo_list->bo_list_mutex);
>>               goto out_free_user_pages;
>>           }
>>   @@ -551,6 +554,7 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>           if (r) {
>>               kvfree(e->user_pages);
>>               e->user_pages = NULL;
>> +            mutex_unlock(&p->bo_list->bo_list_mutex);
>>               goto out_free_user_pages;
>>           }
>>   @@ -568,6 +572,7 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>       if (unlikely(r != 0)) {
>>           if (r != -ERESTARTSYS)
>>               DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
>> +        mutex_unlock(&p->bo_list->bo_list_mutex);
>>           goto out_free_user_pages;
>>       }
>>   @@ -580,11 +585,14 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>               e->chain = dma_fence_chain_alloc();
>>               if (!e->chain) {
>>                   r = -ENOMEM;
>> + mutex_unlock(&p->bo_list->bo_list_mutex);
>>                   goto error_validate;
>>               }
>>           }
>>       }
>>   +    mutex_unlock(&p->bo_list->bo_list_mutex);
>
>
>
>
>
>
>> +
>>       /* Move fence waiting after getting reservation lock of
>>        * PD root. Then there is no need on a ctx mutex lock.
>>        */
>> @@ -607,6 +615,7 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>           goto error_validate;
>>       }
>>   +    mutex_lock(&p->bo_list->bo_list_mutex);
>>       r = amdgpu_cs_list_validate(p, &duplicates);
>>       if (r)
>>           goto error_validate;
>> @@ -614,6 +623,7 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>       r = amdgpu_cs_list_validate(p, &p->validated);
>>       if (r)
>>           goto error_validate;
>> +    mutex_unlock(&p->bo_list->bo_list_mutex);
>>         amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
>>                        p->bytes_moved_vis);
>> @@ -644,15 +654,18 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>     error_validate:
>>       if (r) {
>> +        mutex_lock(&p->bo_list->bo_list_mutex);
>>           amdgpu_bo_list_for_each_entry(e, p->bo_list) {
>>               dma_fence_chain_free(e->chain);
>>               e->chain = NULL;
>>           }
>>           ttm_eu_backoff_reservation(&p->ticket, &p->validated);
>> +        mutex_unlock(&p->bo_list->bo_list_mutex);
>>       }
>>     out_free_user_pages:
>>       if (r) {
>> +        mutex_lock(&p->bo_list->bo_list_mutex);
>>           amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
>>               struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>>   @@ -662,6 +675,7 @@ static int amdgpu_cs_parser_bos(struct 
>> amdgpu_cs_parser *p,
>>               kvfree(e->user_pages);
>>               e->user_pages = NULL;
>>           }
>> +        mutex_unlock(&p->bo_list->bo_list_mutex);
>>       }
>>       return r;
>>   }
>> @@ -704,6 +718,7 @@ static void amdgpu_cs_parser_fini(struct 
>> amdgpu_cs_parser *parser, int error,
>>       if (error && backoff) {
>>           struct amdgpu_bo_list_entry *e;
>>   + mutex_lock(&parser->bo_list->bo_list_mutex);
>>           amdgpu_bo_list_for_each_entry(e, parser->bo_list) {
>>               dma_fence_chain_free(e->chain);
>>               e->chain = NULL;
>> @@ -711,6 +726,7 @@ static void amdgpu_cs_parser_fini(struct 
>> amdgpu_cs_parser *parser, int error,
>>             ttm_eu_backoff_reservation(&parser->ticket,
>>                          &parser->validated);
>> + mutex_unlock(&parser->bo_list->bo_list_mutex);
>>       }
>>         for (i = 0; i < parser->num_post_deps; i++) {
>> @@ -839,6 +855,7 @@ static int amdgpu_cs_vm_handling(struct 
>> amdgpu_cs_parser *p)
>>               return r;
>>       }
>>   +    mutex_lock(&p->bo_list->bo_list_mutex);
>>       amdgpu_bo_list_for_each_entry(e, p->bo_list) {
>>           /* ignore duplicates */
>>           bo = ttm_to_amdgpu_bo(e->tv.bo);
>> @@ -850,13 +867,18 @@ static int amdgpu_cs_vm_handling(struct 
>> amdgpu_cs_parser *p)
>>               continue;
>>             r = amdgpu_vm_bo_update(adev, bo_va, false);
>> -        if (r)
>> +        if (r) {
>> +            mutex_unlock(&p->bo_list->bo_list_mutex);
>>               return r;
>> +        }
>>             r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
>> -        if (r)
>> +        if (r) {
>> +            mutex_unlock(&p->bo_list->bo_list_mutex);
>>               return r;
>> +        }
>>       }
>> +    mutex_unlock(&p->bo_list->bo_list_mutex);
>>         r = amdgpu_vm_handle_moved(adev, vm);
>>       if (r)
>> @@ -874,6 +896,7 @@ static int amdgpu_cs_vm_handling(struct 
>> amdgpu_cs_parser *p)
>>         if (amdgpu_vm_debug) {
>>           /* Invalidate all BOs to test for userspace bugs */
>> +        mutex_lock(&p->bo_list->bo_list_mutex);
>>           amdgpu_bo_list_for_each_entry(e, p->bo_list) {
>>               struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>>   @@ -883,6 +906,7 @@ static int amdgpu_cs_vm_handling(struct 
>> amdgpu_cs_parser *p)
>>                 amdgpu_vm_bo_invalidate(adev, bo, false);
>>           }
>> +        mutex_unlock(&p->bo_list->bo_list_mutex);
>>       }
>>         return amdgpu_cs_sync_rings(p);
>> @@ -1249,6 +1273,7 @@ static int amdgpu_cs_submit(struct 
>> amdgpu_cs_parser *p,
>>        * added to BOs.
>>        */
>>       mutex_lock(&p->adev->notifier_lock);
>> +    mutex_lock(&p->bo_list->bo_list_mutex);
>>         /* If userptr are invalidated after amdgpu_cs_parser_bos(), 
>> return
>>        * -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
>> @@ -1308,12 +1333,14 @@ static int amdgpu_cs_submit(struct 
>> amdgpu_cs_parser *p,
>>       }
>>         ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, 
>> p->fence);
>> +    mutex_unlock(&p->bo_list->bo_list_mutex);
>>       mutex_unlock(&p->adev->notifier_lock);
>>         return 0;
>>     error_abort:
>>       drm_sched_job_cleanup(&job->base);
>> +    mutex_unlock(&p->bo_list->bo_list_mutex);
>>       mutex_unlock(&p->adev->notifier_lock);
>>     error_unlock:
>



More information about the amd-gfx mailing list