[PATCH 2/2] drm/amdgpu: improve VM state machine documentation

Felix Kuehling felix.kuehling at amd.com
Fri Aug 31 23:51:19 UTC 2018


Thanks for this. A few comments and a question inline.

On 2018-08-31 09:27 AM, Christian König wrote:
> Since we get a lot of frequently asked questions about the VM state machine,
> try to improve the documentation by adding functions for each state move.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 107 ++++++++++++++++++++++++---------
>  1 file changed, 79 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index a9275a99d793..40c22635fefd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -204,6 +204,69 @@ static unsigned amdgpu_vm_bo_size(struct amdgpu_device *adev, unsigned level)
>  	return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8);
>  }
>  
> +/**
> + * amdgpu_vm_bo_evicted - vm_bo is evicted
> + *
> + * @vm_bo: vm_bo which is evicted
> + *
> + * State for PDs/PTs and per VM BOs which are not at the location they should
> + * be.
> + */
> +static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
> +{
> +	struct amdgpu_vm *vm = vm_bo->vm;
> +	struct amdgpu_bo *bo = vm_bo->bo;
> +
> +	vm_bo->moved = true;
> +	if (bo->tbo.type == ttm_bo_type_kernel)
> +		list_move(&vm_bo->vm_status, &vm->evicted);
> +	else
> +		list_move_tail(&vm_bo->vm_status, &vm->evicted);
> +}
> +
> +/**
> + * amdgpu_vm_bo_relocated - vm_bo is relocated
> + *
> + * @vm_bo: vm_bo which is relocated
> + *
> + * State for PDs/PTs which need to update their parent PD.
> + */
> +static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
> +{
> +	list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
> +}
> +
> +/**
> + * amdgpu_vm_bo_moved - vm_bo is moved
> + *
> + * @vm_bo: vm_bo which is moved
> + *
> + * State for per VM and normal BOs which are moved, but that change is not yet
> + * reflected in the page tables.

I have a question here. Why does amdgpu_cs_vm_handling call
amdgpu_vm_bo_update manually for its BO list entries? Wouldn't it be
enough to just call amdgpu_vm_handle_moved?

> + */
> +static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
> +{
> +	struct amdgpu_vm *vm = vm_bo->vm;
> +
> +	spin_lock(&vm->moved_lock);
> +	list_move(&vm_bo->vm_status, &vm->moved);
> +	spin_unlock(&vm->moved_lock);

If vm->moved_lock protects the moved list, do we also need to take it
whenever something is moved from that list? That could potentially be
any list_move operation that uses vm_bo->vm_status. I found one case
below where that may not be handled correctly.

> +}
> +
> +/**
> + * amdgpu_vm_bo_idle - vm_bo is idle
> + *
> + * @vm_bo: vm_bo which is now idle
> + *
> + * State for PDs/PTs and per VM BOs which have gone through the state machine
> + * and are now idle.
> + */
> +static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo)
> +{
> +	list_move(&vm_bo->vm_status, &vm_bo->vm->idle);
> +	vm_bo->moved = false;
> +}
> +
>  /**
>   * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
>   *
> @@ -232,9 +295,9 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
>  
>  	vm->bulk_moveable = false;
>  	if (bo->tbo.type == ttm_bo_type_kernel)
> -		list_move(&base->vm_status, &vm->relocated);
> +		amdgpu_vm_bo_relocated(base);
>  	else
> -		list_move(&base->vm_status, &vm->idle);
> +		amdgpu_vm_bo_idle(base);
>  
>  	if (bo->preferred_domains &
>  	    amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type))
> @@ -245,8 +308,7 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
>  	 * is currently evicted. add the bo to the evicted list to make sure it
>  	 * is validated on next vm use to avoid fault.
>  	 * */
> -	list_move_tail(&base->vm_status, &vm->evicted);
> -	base->moved = true;
> +	amdgpu_vm_bo_evicted(base);
>  }
>  
>  /**
> @@ -342,9 +404,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  			break;
>  
>  		if (bo->tbo.type != ttm_bo_type_kernel) {
> -			spin_lock(&vm->moved_lock);
> -			list_move(&bo_base->vm_status, &vm->moved);
> -			spin_unlock(&vm->moved_lock);
> +			amdgpu_vm_bo_moved(bo_base);
>  		} else {
>  			if (vm->use_cpu_for_update)
>  				r = amdgpu_bo_kmap(bo, NULL);
> @@ -352,7 +412,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  				r = amdgpu_ttm_alloc_gart(&bo->tbo);
>  			if (r)
>  				break;
> -			list_move(&bo_base->vm_status, &vm->relocated);
> +			amdgpu_vm_bo_relocated(bo_base);
>  		}
>  	}
>  
> @@ -1123,8 +1183,7 @@ int amdgpu_vm_update_directories(struct amdgpu_device *adev,
>  		bo_base = list_first_entry(&vm->relocated,
>  					   struct amdgpu_vm_bo_base,
>  					   vm_status);
> -		bo_base->moved = false;
> -		list_move(&bo_base->vm_status, &vm->idle);
> +		amdgpu_vm_bo_idle(bo_base);
>  
>  		bo = bo_base->bo->parent;
>  		if (!bo)
> @@ -1243,7 +1302,7 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
>  		if (entry->huge) {
>  			/* Add the entry to the relocated list to update it. */
>  			entry->huge = false;
> -			list_move(&entry->base.vm_status, &p->vm->relocated);
> +			amdgpu_vm_bo_relocated(&entry->base);
>  		}
>  		return;
>  	}
> @@ -1746,9 +1805,9 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>  		uint32_t mem_type = bo->tbo.mem.mem_type;
>  
>  		if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(mem_type)))
> -			list_add_tail(&bo_va->base.vm_status, &vm->evicted);
> +			amdgpu_vm_bo_evicted(&bo_va->base);
>  		else
> -			list_add(&bo_va->base.vm_status, &vm->idle);
> +			amdgpu_vm_bo_idle(&bo_va->base);

There is a small change in behaviour here for clearing
bo_va->base.moved. Not sure if it matters.

>  	}
>  
>  	list_splice_init(&bo_va->invalids, &bo_va->valids);
> @@ -2472,28 +2531,20 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
>  
>  	list_for_each_entry(bo_base, &bo->va, bo_list) {
>  		struct amdgpu_vm *vm = bo_base->vm;
> -		bool was_moved = bo_base->moved;
>  
> -		bo_base->moved = true;
>  		if (evicted && bo->tbo.resv == vm->root.base.bo->tbo.resv) {
> -			if (bo->tbo.type == ttm_bo_type_kernel)
> -				list_move(&bo_base->vm_status, &vm->evicted);
> -			else
> -				list_move_tail(&bo_base->vm_status,
> -					       &vm->evicted);
> +			amdgpu_vm_bo_evicted(bo_base);

I think here it's possible that the BO was on the moved list. I think
that means amdgpu_vm_bo_evicted should take the moved_lock just in case.

Regards,
  Felix

>  			continue;
>  		}
>  
> -		if (was_moved)
> +		if (bo_base->moved)
>  			continue;
>  
> -		if (bo->tbo.type == ttm_bo_type_kernel) {
> -			list_move(&bo_base->vm_status, &vm->relocated);
> -		} else {
> -			spin_lock(&bo_base->vm->moved_lock);
> -			list_move(&bo_base->vm_status, &vm->moved);
> -			spin_unlock(&bo_base->vm->moved_lock);
> -		}
> +		bo_base->moved = true;
> +		if (bo->tbo.type == ttm_bo_type_kernel)
> +			amdgpu_vm_bo_relocated(bo_base);
> +		else
> +			amdgpu_vm_bo_moved(bo_base);
>  	}
>  }
>  



More information about the amd-gfx mailing list