[PATCH] drm/amdgpu: Optimize recursion in amdgpu_vm_update_level
zhoucm1
david1.zhou at amd.com
Wed Jul 12 04:30:41 UTC 2017
Nice improvement. Just some nitpicks inline; otherwise the patch is
Reviewed-by: Chunming Zhou <david1.zhou at amd.com>.
On 2017-07-12 11:56, Felix Kuehling wrote:
> When many virtual address spaces are in use, there can be thousands
> of page table BOs. amdgpu_vm_update_level iterates over all of them
> recursively. In many cases only a few, or none at all, need to be
> updated. Minimize unnecessary code execution and memory usage in
> those cases.
>
> This speeds up memory mapping in a synthetic KFD memory mapping
> benchmark by roughly a factor of two.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
> 1 file changed, 55 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index ff5de3a..23b899b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
> {
> struct amdgpu_bo *shadow;
> struct amdgpu_ring *ring = NULL;
> - uint64_t pd_addr, shadow_addr = 0;
> + uint64_t pd_addr = 0, shadow_addr = 0;
> uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
> uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
> unsigned count = 0, pt_idx, ndw = 0;
> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>
> WARN_ON(vm->use_cpu_for_update && shadow);
> if (vm->use_cpu_for_update && !shadow) {
> - r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
> - if (r)
> - return r;
> - r = amdgpu_vm_bo_wait(adev, parent->bo);
> - if (unlikely(r)) {
> - amdgpu_bo_kunmap(parent->bo);
> - return r;
> - }
> + /* Defer kmapping until it's actually needed. Some
> + * PDBs may need no update at all
> + */
> params.func = amdgpu_vm_cpu_set_ptes;
> + params.ib = (void *)(long)-1;
> } else {
> - if (shadow) {
> - r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
> - if (r)
> - return r;
> - }
> - ring = container_of(vm->entity.sched, struct amdgpu_ring,
> - sched);
> -
> - /* padding, etc. */
> - ndw = 64;
> -
> - /* assume the worst case */
> - ndw += parent->last_entry_used * 6;
> -
> - pd_addr = amdgpu_bo_gpu_offset(parent->bo);
> -
> - if (shadow) {
> - shadow_addr = amdgpu_bo_gpu_offset(shadow);
> - ndw *= 2;
> - } else {
> - shadow_addr = 0;
> - }
> -
> - r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
> - if (r)
> - return r;
> -
> - params.ib = &job->ibs[0];
> + /* Defer IB allocation until it's actually
> + * needed. Some PDBs may need no update at all
> + */
> + params.ib = NULL;
> params.func = amdgpu_vm_do_set_ptes;
> }
>
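A note for readers following along, since this split is the heart of
the patch: the CPU path stores a dummy non-NULL sentinel in params.ib
so that only the "else if (!pd_addr)" kmap branch below can fire,
while the SDMA path leaves params.ib NULL so that the
"if (!params.ib)" branch allocates the IB at the first PDE that
actually changed. A minimal standalone sketch of the lazy-setup
pattern (illustrative names, not the kernel code):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the command buffer (IB) that is expensive to set up. */
struct ctx {
	int *ib;
	size_t len;
};

/* Create the buffer only when the first dirty entry is found,
 * sized for the entries still left to visit. */
static int ensure_ib(struct ctx *c, size_t remaining)
{
	if (c->ib)
		return 0;
	c->ib = calloc(remaining, sizeof(*c->ib));
	return c->ib ? 0 : -1;
}

int main(void)
{
	int pde[8] = { 0 };		/* pretend directory, all clean */
	struct ctx c = { 0 };
	size_t i;

	for (i = 0; i < 8; i++) {
		if (!pde[i])
			continue;		/* unchanged: skip cheaply */
		if (ensure_ib(&c, 8 - i))	/* pay the cost lazily */
			return 1;
		c.ib[c.len++] = pde[i];
	}
	if (!c.ib)
		printf("nothing dirty: no IB was ever allocated\n");
	free(c.ib);
	return 0;
}

If every entry turns out to be clean, neither the kmap nor the IB
allocation ever happens, which is where the win on sparse updates
comes from.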
> -
> /* walk over the address space and update the directory */
> for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
> struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
> if (bo == NULL)
> continue;
>
> - if (bo->shadow) {
> - struct amdgpu_bo *pt_shadow = bo->shadow;
> -
> - r = amdgpu_ttm_bind(&pt_shadow->tbo,
> - &pt_shadow->tbo.mem);
> - if (r)
> - return r;
> - }
> -
> - pt = amdgpu_bo_gpu_offset(bo);
> - pt = amdgpu_gart_get_vm_pde(adev, pt);
> + pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
I think getting the offset from amdgpu_bo_gpu_offset(bo) is better,
although we can assume the PT BO is in a known domain.
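For readers without the tree open: both end up returning
bo->tbo.offset; the helper just adds sanity checks. Roughly
(paraphrased from memory of the helper around this kernel version,
so treat the exact WARNs as approximate):

u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo)
{
	/* A BO still in system memory has no GPU address yet. */
	WARN_ON_ONCE(bo->tbo.mem.mem_type == TTM_PL_SYSTEM);
	/* Reading the offset of an unpinned, unreserved BO is racy;
	 * it could be evicted and moved underneath us. */
	WARN_ON_ONCE(!ww_mutex_is_locked(&bo->tbo.resv->lock) &&
		     !bo->pin_count);
	return bo->tbo.offset;
}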
> if (parent->entries[pt_idx].addr == pt)
> continue;
>
> parent->entries[pt_idx].addr = pt;
>
> + if (!params.ib) {
> + if (shadow) {
> + r = amdgpu_ttm_bind(&shadow->tbo,
> + &shadow->tbo.mem);
> + if (r)
> + return r;
> + }
> +
> + ring = container_of(vm->entity.sched,
> + struct amdgpu_ring, sched);
> +
> + /* padding, etc. */
> + ndw = 64;
> +
> + /* assume the worst case */
> + ndw += (parent->last_entry_used - pt_idx) * 6;
> +
> + pd_addr = parent->bo->tbo.offset;
> +
> + if (shadow) {
> + shadow_addr = shadow->tbo.offset;
> + ndw *= 2;
> + } else {
> + shadow_addr = 0;
> + }
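A nice side effect of allocating at the first dirty entry: the
worst-case IB size covers only the remaining entries instead of all
of them. For example, with last_entry_used = 511 and the first
changed PDE at pt_idx = 500, this gives
ndw = 64 + (511 - 500) * 6 = 130 dwords (260 with a shadow), versus
64 + 511 * 6 = 3130 before the patch.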
> + r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
> + if (r)
> + return r;
> +
> + params.ib = &job->ibs[0];
> + } else if (!pd_addr) {
> + r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
> + if (r)
> + return r;
> + r = amdgpu_vm_bo_wait(adev, parent->bo);
> + if (unlikely(r)) {
> + amdgpu_bo_kunmap(parent->bo);
> + return r;
> + }
> + }
> +
> pde = pd_addr + pt_idx * 8;
> if (((last_pde + 8 * count) != pde) ||
> ((last_pt + incr * count) != pt) ||
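Context, since the hunk ends mid-condition: this is the pre-existing
run-coalescing logic, where consecutive PDEs whose page tables are
also consecutive get folded into a single params.func() call covering
"count" entries. A simplified sketch of the idea; flush() and
pt_base[] are stand-ins, not the kernel's names, and the real code
also caps the run length:

	uint64_t last_pde = ~0ULL, last_pt = ~0ULL;
	unsigned int count = 0, idx;

	for (idx = 0; idx <= last_entry_used; ++idx) {
		uint64_t pde = pd_addr + idx * 8; /* 8 bytes per PDE */
		uint64_t pt = pt_base[idx];	  /* child PT address */

		if (last_pde + 8 * count != pde ||
		    last_pt + incr * count != pt) {
			if (count)		/* emit finished run */
				flush(last_pde, last_pt, count);
			last_pde = pde;
			last_pt = pt;
			count = 1;
		} else {
			++count;		/* extend current run */
		}
	}
	if (count)				/* emit the tail run */
		flush(last_pde, last_pt, count);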
> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>
> if (params.func == amdgpu_vm_cpu_set_ptes)
> amdgpu_bo_kunmap(parent->bo);
> - else if (params.ib->length_dw == 0) {
> + else if (params.ib && params.ib->length_dw == 0) {
> amdgpu_job_free(job);
> - } else {
> + } else if (params.ib) {
> amdgpu_ring_pad_ib(ring, params.ib);
> amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
> AMDGPU_FENCE_OWNER_VM);
> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>
> amdgpu_bo_fence(parent->bo, fence, true);
> dma_fence_put(vm->last_dir_update);
> - vm->last_dir_update = dma_fence_get(fence);
> - dma_fence_put(fence);
> + vm->last_dir_update = fence;
Here I'd prefer to keep the previous code; the logic is clearer. :)
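For what it's worth, both variants are refcount-correct; they differ
only in whether the reference returned by the submit is handed over
or duplicated and then dropped. Side by side (sketch):

	/* Previous code: take a second reference for last_dir_update,
	 * then drop the local one from the submit. */
	vm->last_dir_update = dma_fence_get(fence);
	dma_fence_put(fence);

	/* Patch: transfer the submit's reference directly; one fewer
	 * atomic get/put pair, but the handoff is implicit. */
	vm->last_dir_update = fence;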
> }
> /*
> * Recurse into the subdirectories. This recursion is harmless because
> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
> for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
> struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>
> - if (!entry->bo)
> + if (!entry->bo || !entry->entries)
In fact, !entry->entries would be checked at the beginning of this
function on the next recursion, but either way is OK with me here.
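If I read this right, the alternative would be for the callee to bail
out up front instead; hypothetically (this check is not part of the
patch):

static int amdgpu_vm_update_level(struct amdgpu_device *adev,
				  struct amdgpu_vm *vm,
				  struct amdgpu_vm_pt *parent,
				  unsigned level)
{
	/* Hypothetical early-out: a leaf page table has no child
	 * entries, so there is no directory level to update here. */
	if (!parent->entries)
		return 0;

	/* ... rest of the function as in the diff above ... */
}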
Regards,
David Zhou
> continue;
>
> r = amdgpu_vm_update_level(adev, vm, entry, level + 1);