[PATCH v3 2/2] drm/amdgpu: unref the bo after job submit

Fri Mar 13 14:20:44 UTC 2020

Am 13.03.20 um 15:07 schrieb xinhui pan:
> Otherwise we might free BOs before job has completed.
> We add the fence to BO after commit, so free BOs after that.

I'm not sure if this is necessary, but probably better save than sorry.

Some comments below.

>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Cc: Felix Kuehling <Felix.Kuehling at amd.com>
> Signed-off-by: xinhui pan <xinhui.pan at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 38 +++++++++++++++++++-------
>   1 file changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 809ca6e8f40f..605a1bb40280 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -942,13 +942,17 @@ static int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>    *
>    * @entry: PDE to free
>    */
> -static void amdgpu_vm_free_table(struct amdgpu_vm_pt *entry)
> +static void amdgpu_vm_free_table(struct amdgpu_vm_pt *entry,
> +					struct list_head *head)
>   {
>   	if (entry->base.bo) {
>   		entry->base.bo->vm_bo = NULL;
>   		list_del(&entry->base.vm_status);
> -		amdgpu_bo_unref(&entry->base.bo->shadow);
> -		amdgpu_bo_unref(&entry->base.bo);
> +		if (!head) {
> +			amdgpu_bo_unref(&entry->base.bo->shadow);
> +			amdgpu_bo_unref(&entry->base.bo);
> +		} else
> +			list_add(&entry->base.vm_status, head);

Instead of adding a parameter make this a new state in the VM. Something 
like vm->zombies or something similar.

>   	}
>   	kvfree(entry->entries);
>   	entry->entries = NULL;
> @@ -965,7 +969,8 @@ static void amdgpu_vm_free_table(struct amdgpu_vm_pt *entry)
>    */
>   static void amdgpu_vm_free_pts(struct amdgpu_device *adev,
>   			       struct amdgpu_vm *vm,
> -			       struct amdgpu_vm_pt_cursor *start)
> +			       struct amdgpu_vm_pt_cursor *start,
> +				   struct list_head *head)
>   {
>   	struct amdgpu_vm_pt_cursor cursor;
>   	struct amdgpu_vm_pt *entry;
> @@ -973,10 +978,10 @@ static void amdgpu_vm_free_pts(struct amdgpu_device *adev,
>   	vm->bulk_moveable = false;
>   
>   	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
> -		amdgpu_vm_free_table(entry);
> +		amdgpu_vm_free_table(entry, head);
>   
>   	if (start)
> -		amdgpu_vm_free_table(start->entry);
> +		amdgpu_vm_free_table(start->entry, head);
>   }
>   
>   /**
> @@ -1428,7 +1433,8 @@ static void amdgpu_vm_fragment(struct amdgpu_vm_update_params *params,
>    */
>   static int amdgpu_vm_update_ptes(struct amdgpu_vm_update_params *params,
>   				 uint64_t start, uint64_t end,
> -				 uint64_t dst, uint64_t flags)
> +				 uint64_t dst, uint64_t flags,
> +				 struct list_head *head)
>   {
>   	struct amdgpu_device *adev = params->adev;
>   	struct amdgpu_vm_pt_cursor cursor;
> @@ -1539,7 +1545,7 @@ static int amdgpu_vm_update_ptes(struct amdgpu_vm_update_params *params,
>   			 * completely covered by the range and so potentially still in use.
>   			 */
>   			while (cursor.pfn < frag_start) {
> -				amdgpu_vm_free_pts(adev, params->vm, &cursor);
> +				amdgpu_vm_free_pts(adev, params->vm, &cursor, head);
>   				amdgpu_vm_pt_next(adev, &cursor);
>   			}
>   
> @@ -1583,6 +1589,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>   	struct amdgpu_vm_update_params params;
>   	enum amdgpu_sync_mode sync_mode;
>   	int r;
> +	struct list_head head;
>   
>   	memset(&params, 0, sizeof(params));
>   	params.adev = adev;
> @@ -1590,6 +1597,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>   	params.direct = direct;
>   	params.pages_addr = pages_addr;
>   
> +	INIT_LIST_HEAD(&head);
> +
>   	/* Implicitly sync to command submissions in the same VM before
>   	 * unmapping. Sync to moving fences before mapping.
>   	 */
> @@ -1617,13 +1626,22 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>   	if (r)
>   		goto error_unlock;
>   
> -	r = amdgpu_vm_update_ptes(&params, start, last + 1, addr, flags);
> +	r = amdgpu_vm_update_ptes(&params, start, last + 1, addr, flags, &head);
>   	if (r)
>   		goto error_unlock;
>   
>   	r = vm->update_funcs->commit(&params, fence);
>   
>   error_unlock:
> +	while (!list_empty(&head)) {
> +		struct amdgpu_vm_pt *entry = list_first_entry(&head,
> +				struct amdgpu_vm_pt, base.vm_status);
> +		list_del(&entry->base.vm_status);
> +
> +		amdgpu_bo_unref(&entry->base.bo->shadow);
> +		amdgpu_bo_unref(&entry->base.bo);
> +	}

Maybe put that into a separate function.

> +
>   	amdgpu_vm_eviction_unlock(vm);
>   	return r;
>   }
> @@ -3118,7 +3136,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   		amdgpu_vm_free_mapping(adev, vm, mapping, NULL);
>   	}
>   
> -	amdgpu_vm_free_pts(adev, vm, NULL);
> +	amdgpu_vm_free_pts(adev, vm, NULL, NULL);

And then also call it here.

Regards,
Christian.

>   	amdgpu_bo_unreserve(root);
>   	amdgpu_bo_unref(&root);
>   	WARN_ON(vm->root.base.bo);