[PATCH v3 2/2] drm/amdgpu: unref the bo after job submit

Christian König christian.koenig at amd.com
Fri Mar 13 18:08:11 UTC 2020


Am 13.03.20 um 15:54 schrieb Pan, Xinhui:
>
>> 2020年3月13日 22:20,Koenig, Christian <Christian.Koenig at amd.com> 写道:
>>
>> Am 13.03.20 um 15:07 schrieb xinhui pan:
>>> Otherwise we might free BOs before job has completed.
>>> We add the fence to BO after commit, so free BOs after that.
>> I'm not sure if this is necessary, but probably better safe than sorry.
> without this patch, we hit gmc page fault.

Then this is not the root cause.

See freeing page tables here is safe since they can only be reused after 
all jobs have completed.

This only fixes a very small window where the PDE would still point to 
the freed PT which might already be reused.

But this can't cause a GMC page fault, it would rather be a security 
problem.

Regards,
Christian.

>
> we have individualized bo resv during bo releasing.
> so any fences added to root PT bo resv after that are actually untested.
>
>> Some comments below.
>>
>>> Cc: Christian König <christian.koenig at amd.com>
>>> Cc: Alex Deucher <alexander.deucher at amd.com>
>>> Cc: Felix Kuehling <Felix.Kuehling at amd.com>
>>> Signed-off-by: xinhui pan <xinhui.pan at amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 38 +++++++++++++++++++-------
>>>   1 file changed, 28 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index 809ca6e8f40f..605a1bb40280 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -942,13 +942,17 @@ static int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>>>    *
>>>    * @entry: PDE to free
>>>    */
>>> -static void amdgpu_vm_free_table(struct amdgpu_vm_pt *entry)
>>> +static void amdgpu_vm_free_table(struct amdgpu_vm_pt *entry,
>>> +					struct list_head *head)
>>>   {
>>>   	if (entry->base.bo) {
>>>   		entry->base.bo->vm_bo = NULL;
>>>   		list_del(&entry->base.vm_status);
>>> -		amdgpu_bo_unref(&entry->base.bo->shadow);
>>> -		amdgpu_bo_unref(&entry->base.bo);
>>> +		if (!head) {
>>> +			amdgpu_bo_unref(&entry->base.bo->shadow);
>>> +			amdgpu_bo_unref(&entry->base.bo);
>>> +		} else
>>> +			list_add(&entry->base.vm_status, head);
>> Instead of adding a parameter make this a new state in the VM. Something like vm->zombies or something similar.
>>
>>>   	}
>>>   	kvfree(entry->entries);
>>>   	entry->entries = NULL;
>>> @@ -965,7 +969,8 @@ static void amdgpu_vm_free_table(struct amdgpu_vm_pt *entry)
>>>    */
>>>   static void amdgpu_vm_free_pts(struct amdgpu_device *adev,
>>>   			       struct amdgpu_vm *vm,
>>> -			       struct amdgpu_vm_pt_cursor *start)
>>> +			       struct amdgpu_vm_pt_cursor *start,
>>> +				   struct list_head *head)
>>>   {
>>>   	struct amdgpu_vm_pt_cursor cursor;
>>>   	struct amdgpu_vm_pt *entry;
>>> @@ -973,10 +978,10 @@ static void amdgpu_vm_free_pts(struct amdgpu_device *adev,
>>>   	vm->bulk_moveable = false;
>>>     	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
>>> -		amdgpu_vm_free_table(entry);
>>> +		amdgpu_vm_free_table(entry, head);
>>>     	if (start)
>>> -		amdgpu_vm_free_table(start->entry);
>>> +		amdgpu_vm_free_table(start->entry, head);
>>>   }
>>>     /**
>>> @@ -1428,7 +1433,8 @@ static void amdgpu_vm_fragment(struct amdgpu_vm_update_params *params,
>>>    */
>>>   static int amdgpu_vm_update_ptes(struct amdgpu_vm_update_params *params,
>>>   				 uint64_t start, uint64_t end,
>>> -				 uint64_t dst, uint64_t flags)
>>> +				 uint64_t dst, uint64_t flags,
>>> +				 struct list_head *head)
>>>   {
>>>   	struct amdgpu_device *adev = params->adev;
>>>   	struct amdgpu_vm_pt_cursor cursor;
>>> @@ -1539,7 +1545,7 @@ static int amdgpu_vm_update_ptes(struct amdgpu_vm_update_params *params,
>>>   			 * completely covered by the range and so potentially still in use.
>>>   			 */
>>>   			while (cursor.pfn < frag_start) {
>>> -				amdgpu_vm_free_pts(adev, params->vm, &cursor);
>>> +				amdgpu_vm_free_pts(adev, params->vm, &cursor, head);
>>>   				amdgpu_vm_pt_next(adev, &cursor);
>>>   			}
>>>   @@ -1583,6 +1589,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>>>   	struct amdgpu_vm_update_params params;
>>>   	enum amdgpu_sync_mode sync_mode;
>>>   	int r;
>>> +	struct list_head head;
>>>     	memset(&params, 0, sizeof(params));
>>>   	params.adev = adev;
>>> @@ -1590,6 +1597,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>>>   	params.direct = direct;
>>>   	params.pages_addr = pages_addr;
>>>   +	INIT_LIST_HEAD(&head);
>>> +
>>>   	/* Implicitly sync to command submissions in the same VM before
>>>   	 * unmapping. Sync to moving fences before mapping.
>>>   	 */
>>> @@ -1617,13 +1626,22 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>>>   	if (r)
>>>   		goto error_unlock;
>>>   -	r = amdgpu_vm_update_ptes(&params, start, last + 1, addr, flags);
>>> +	r = amdgpu_vm_update_ptes(&params, start, last + 1, addr, flags, &head);
>>>   	if (r)
>>>   		goto error_unlock;
>>>     	r = vm->update_funcs->commit(&params, fence);
>>>     error_unlock:
>>> +	while (!list_empty(&head)) {
>>> +		struct amdgpu_vm_pt *entry = list_first_entry(&head,
>>> +				struct amdgpu_vm_pt, base.vm_status);
>>> +		list_del(&entry->base.vm_status);
>>> +
>>> +		amdgpu_bo_unref(&entry->base.bo->shadow);
>>> +		amdgpu_bo_unref(&entry->base.bo);
>>> +	}
>> Maybe put that into a separate function.
>>
>>> +
>>>   	amdgpu_vm_eviction_unlock(vm);
>>>   	return r;
>>>   }
>>> @@ -3118,7 +3136,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>>   		amdgpu_vm_free_mapping(adev, vm, mapping, NULL);
>>>   	}
>>>   -	amdgpu_vm_free_pts(adev, vm, NULL);
>>> +	amdgpu_vm_free_pts(adev, vm, NULL, NULL);
>> And then also call it here.
>>
>> Regards,
>> Christian.
>>
>>>   	amdgpu_bo_unreserve(root);
>>>   	amdgpu_bo_unref(&root);
>>>   	WARN_ON(vm->root.base.bo);



More information about the amd-gfx mailing list