[PATCH 2/2] drm/amdgpu: improve VM state machine documentation
Felix Kuehling
felix.kuehling at amd.com
Fri Aug 31 23:51:19 UTC 2018
Thanks for this. A few comments and a question inline.
On 2018-08-31 09:27 AM, Christian König wrote:
> Since we get a lot of frequently asked questions about the VM state machine,
> try to improve the documentation by adding functions for each state move.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 107 ++++++++++++++++++++++++---------
> 1 file changed, 79 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index a9275a99d793..40c22635fefd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -204,6 +204,69 @@ static unsigned amdgpu_vm_bo_size(struct amdgpu_device *adev, unsigned level)
> return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8);
> }
>
> +/**
> + * amdgpu_vm_bo_evicted - vm_bo is evicted
> + *
> + * @vm_bo: vm_bo which is evicted
> + *
> + * State for PDs/PTs and per VM BOs which are not at the location they should
> + * be.
> + */
> +static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
> +{
> + struct amdgpu_vm *vm = vm_bo->vm;
> + struct amdgpu_bo *bo = vm_bo->bo;
> +
> + vm_bo->moved = true;
> + if (bo->tbo.type == ttm_bo_type_kernel)
> + list_move(&vm_bo->vm_status, &vm->evicted);
> + else
> + list_move_tail(&vm_bo->vm_status, &vm->evicted);
> +}
> +
> +/**
> + * amdgpu_vm_bo_relocated - vm_bo is relocated
> + *
> + * @vm_bo: vm_bo which is relocated
> + *
> + * State for PDs/PTs which need to update their parent PD.
> + */
> +static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
> +{
> + list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
> +}
> +
> +/**
> + * amdgpu_vm_bo_moved - vm_bo is moved
> + *
> + * @vm_bo: vm_bo which is moved
> + *
> + * State for per VM and normal BOs which are moved, but that change is not yet
> + * reflected in the page tables.
I have a question here. Why does amdgpu_cs_vm_handling call
amdgpu_vm_bo_update manually for its BO list entries? Wouldn't it be
enough to just call amdgpu_vm_handle_moved?
> + */
> +static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
> +{
> + struct amdgpu_vm *vm = vm_bo->vm;
> +
> + spin_lock(&vm->moved_lock);
> + list_move(&vm_bo->vm_status, &vm->moved);
> + spin_unlock(&vm->moved_lock);
If vm->moved_lock protects the moved list, do we also need to take it
whenever something is moved from that list? That could potentially be
any list_move operation that uses vm_bo->vm_status. I found one case
below where that may not be handled correctly.
> +}
> +
> +/**
> + * amdgpu_vm_bo_idle - vm_bo is idle
> + *
> + * @vm_bo: vm_bo which is now idle
> + *
> + * State for PDs/PTs and per VM BOs which have gone through the state machine
> + * and are now idle.
> + */
> +static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo)
> +{
> + list_move(&vm_bo->vm_status, &vm_bo->vm->idle);
> + vm_bo->moved = false;
> +}
> +
> /**
> * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
> *
> @@ -232,9 +295,9 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
>
> vm->bulk_moveable = false;
> if (bo->tbo.type == ttm_bo_type_kernel)
> - list_move(&base->vm_status, &vm->relocated);
> + amdgpu_vm_bo_relocated(base);
> else
> - list_move(&base->vm_status, &vm->idle);
> + amdgpu_vm_bo_idle(base);
>
> if (bo->preferred_domains &
> amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type))
> @@ -245,8 +308,7 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
> * is currently evicted. add the bo to the evicted list to make sure it
> * is validated on next vm use to avoid fault.
> * */
> - list_move_tail(&base->vm_status, &vm->evicted);
> - base->moved = true;
> + amdgpu_vm_bo_evicted(base);
> }
>
> /**
> @@ -342,9 +404,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> break;
>
> if (bo->tbo.type != ttm_bo_type_kernel) {
> - spin_lock(&vm->moved_lock);
> - list_move(&bo_base->vm_status, &vm->moved);
> - spin_unlock(&vm->moved_lock);
> + amdgpu_vm_bo_moved(bo_base);
> } else {
> if (vm->use_cpu_for_update)
> r = amdgpu_bo_kmap(bo, NULL);
> @@ -352,7 +412,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> r = amdgpu_ttm_alloc_gart(&bo->tbo);
> if (r)
> break;
> - list_move(&bo_base->vm_status, &vm->relocated);
> + amdgpu_vm_bo_relocated(bo_base);
> }
> }
>
> @@ -1123,8 +1183,7 @@ int amdgpu_vm_update_directories(struct amdgpu_device *adev,
> bo_base = list_first_entry(&vm->relocated,
> struct amdgpu_vm_bo_base,
> vm_status);
> - bo_base->moved = false;
> - list_move(&bo_base->vm_status, &vm->idle);
> + amdgpu_vm_bo_idle(bo_base);
>
> bo = bo_base->bo->parent;
> if (!bo)
> @@ -1243,7 +1302,7 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
> if (entry->huge) {
> /* Add the entry to the relocated list to update it. */
> entry->huge = false;
> - list_move(&entry->base.vm_status, &p->vm->relocated);
> + amdgpu_vm_bo_relocated(&entry->base);
> }
> return;
> }
> @@ -1746,9 +1805,9 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
> uint32_t mem_type = bo->tbo.mem.mem_type;
>
> if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(mem_type)))
> - list_add_tail(&bo_va->base.vm_status, &vm->evicted);
> + amdgpu_vm_bo_evicted(&bo_va->base);
> else
> - list_add(&bo_va->base.vm_status, &vm->idle);
> + amdgpu_vm_bo_idle(&bo_va->base);
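There is a small change in behaviour here for bo_va->base.moved: the old
list_add/list_add_tail calls left the flag untouched, whereas
amdgpu_vm_bo_evicted now sets it and amdgpu_vm_bo_idle now clears it.
Not sure if it matters.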
> }
>
> list_splice_init(&bo_va->invalids, &bo_va->valids);
> @@ -2472,28 +2531,20 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
>
> list_for_each_entry(bo_base, &bo->va, bo_list) {
> struct amdgpu_vm *vm = bo_base->vm;
> - bool was_moved = bo_base->moved;
>
> - bo_base->moved = true;
> if (evicted && bo->tbo.resv == vm->root.base.bo->tbo.resv) {
> - if (bo->tbo.type == ttm_bo_type_kernel)
> - list_move(&bo_base->vm_status, &vm->evicted);
> - else
> - list_move_tail(&bo_base->vm_status,
> - &vm->evicted);
> + amdgpu_vm_bo_evicted(bo_base);
I think here it's possible that the BO was on the moved list. I think
that means amdgpu_vm_bo_evicted should take the moved_lock just in case.
Regards,
Felix
> continue;
> }
>
> - if (was_moved)
> + if (bo_base->moved)
> continue;
>
> - if (bo->tbo.type == ttm_bo_type_kernel) {
> - list_move(&bo_base->vm_status, &vm->relocated);
> - } else {
> - spin_lock(&bo_base->vm->moved_lock);
> - list_move(&bo_base->vm_status, &vm->moved);
> - spin_unlock(&bo_base->vm->moved_lock);
> - }
> + bo_base->moved = true;
> + if (bo->tbo.type == ttm_bo_type_kernel)
> + amdgpu_vm_bo_relocated(bo_base);
> + else
> + amdgpu_vm_bo_moved(bo_base);
> }
> }
>
More information about the amd-gfx
mailing list