[PATCH] drm/amdgpu: Optimize recursion in amdgpu_vm_update_level
zhoucm1
david1.zhou at amd.com
Wed Jul 12 04:30:41 UTC 2017
Nice improvement. Just some nitpicks inline; otherwise the patch is
Reviewed-by: Chunming Zhou <david1.zhou at amd.com>.
On 2017-07-12 11:56, Felix Kuehling wrote:
> When many virtual address spaces are in use, there can be thousands
> of page table BOs. amdgpu_vm_update_level iterates over all of them
> recursively. In many cases only a few, or none at all, need to be
> updated. Minimize unnecessary code execution and memory usage in
> those cases.
>
> This speeds up memory mapping in a synthetic KFD memory mapping
> benchmark by roughly a factor of two.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
> 1 file changed, 55 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index ff5de3a..23b899b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
> {
> struct amdgpu_bo *shadow;
> struct amdgpu_ring *ring = NULL;
> - uint64_t pd_addr, shadow_addr = 0;
> + uint64_t pd_addr = 0, shadow_addr = 0;
> uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
> uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
> unsigned count = 0, pt_idx, ndw = 0;
> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>
> WARN_ON(vm->use_cpu_for_update && shadow);
> if (vm->use_cpu_for_update && !shadow) {
> - r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
> - if (r)
> - return r;
> - r = amdgpu_vm_bo_wait(adev, parent->bo);
> - if (unlikely(r)) {
> - amdgpu_bo_kunmap(parent->bo);
> - return r;
> - }
> + /* Defer kmapping until it's actually needed. Some
> + * PDBs may need no update at all
> + */
> params.func = amdgpu_vm_cpu_set_ptes;
> + params.ib = (void *)(long)-1;
> } else {
> - if (shadow) {
> - r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
> - if (r)
> - return r;
> - }
> - ring = container_of(vm->entity.sched, struct amdgpu_ring,
> - sched);
> -
> - /* padding, etc. */
> - ndw = 64;
> -
> - /* assume the worst case */
> - ndw += parent->last_entry_used * 6;
> -
> - pd_addr = amdgpu_bo_gpu_offset(parent->bo);
> -
> - if (shadow) {
> - shadow_addr = amdgpu_bo_gpu_offset(shadow);
> - ndw *= 2;
> - } else {
> - shadow_addr = 0;
> - }
> -
> - r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
> - if (r)
> - return r;
> -
> - params.ib = &job->ibs[0];
> + /* Defer IB allocation until it's actually
> + * needed. Some PDBs may need no update at all
> + */
> + params.ib = NULL;
> params.func = amdgpu_vm_do_set_ptes;
> }
>
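A note for readers following along, since this split is the heart of
the patch: the CPU path stores a dummy non-NULL sentinel in params.ib
so that only the "else if (!pd_addr)" kmap branch below can fire,
while the SDMA path leaves params.ib NULL so that the
"if (!params.ib)" branch allocates the IB at the first PDE that
actually changed. A minimal standalone sketch of the lazy-setup
pattern (illustrative names, not the kernel code):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the command buffer (IB) that is expensive to set up. */
struct ctx {
	int *ib;
	size_t len;
};

/* Create the buffer only when the first dirty entry is found,
 * sized for the entries still left to visit. */
static int ensure_ib(struct ctx *c, size_t remaining)
{
	if (c->ib)
		return 0;
	c->ib = calloc(remaining, sizeof(*c->ib));
	return c->ib ? 0 : -1;
}

int main(void)
{
	int pde[8] = { 0 };		/* pretend directory, all clean */
	struct ctx c = { 0 };
	size_t i;

	for (i = 0; i < 8; i++) {
		if (!pde[i])
			continue;		/* unchanged: skip cheaply */
		if (ensure_ib(&c, 8 - i))	/* pay the cost lazily */
			return 1;
		c.ib[c.len++] = pde[i];
	}
	if (!c.ib)
		printf("nothing dirty: no IB was ever allocated\n");
	free(c.ib);
	return 0;
}

If every entry turns out to be clean, neither the kmap nor the IB
allocation ever happens, which is where the win on sparse updates
comes from.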
> -
> /* walk over the address space and update the directory */
> for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
> struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
> if (bo == NULL)
> continue;
>
> - if (bo->shadow) {
> - struct amdgpu_bo *pt_shadow = bo->shadow;
> -
> - r = amdgpu_ttm_bind(&pt_shadow->tbo,
> - &pt_shadow->tbo.mem);
> - if (r)
> - return r;
> - }
> -
> - pt = amdgpu_bo_gpu_offset(bo);
> - pt = amdgpu_gart_get_vm_pde(adev, pt);
> + pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
I think getting the offset from amdgpu_bo_gpu_offset(bo) is better,
although we can assume the PT BO is in a known domain.
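For readers without the tree open: both end up returning
bo->tbo.offset; the helper just adds sanity checks. Roughly
(paraphrased from memory of the helper around this kernel version,
so treat the exact WARNs as approximate):

u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo)
{
	/* A BO still in system memory has no GPU address yet. */
	WARN_ON_ONCE(bo->tbo.mem.mem_type == TTM_PL_SYSTEM);
	/* Reading the offset of an unpinned, unreserved BO is racy;
	 * it could be evicted and moved underneath us. */
	WARN_ON_ONCE(!ww_mutex_is_locked(&bo->tbo.resv->lock) &&
		     !bo->pin_count);
	return bo->tbo.offset;
}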
> if (parent->entries[pt_idx].addr == pt)
> continue;
>
> parent->entries[pt_idx].addr = pt;
>
> + if (!params.ib) {
> + if (shadow) {
> + r = amdgpu_ttm_bind(&shadow->tbo,
> + &shadow->tbo.mem);
> + if (r)
> + return r;
> + }
> +
> + ring = container_of(vm->entity.sched,
> + struct amdgpu_ring, sched);
> +
> + /* padding, etc. */
> + ndw = 64;
> +
> + /* assume the worst case */
> + ndw += (parent->last_entry_used - pt_idx) * 6;
> +
> + pd_addr = parent->bo->tbo.offset;
> +
> + if (shadow) {
> + shadow_addr = shadow->tbo.offset;
> + ndw *= 2;
> + } else {
> + shadow_addr = 0;
> + }
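A nice side effect of allocating at the first dirty entry: the
worst-case IB size covers only the remaining entries instead of all
of them. For example, with last_entry_used = 511 and the first
changed PDE at pt_idx = 500, this gives
ndw = 64 + (511 - 500) * 6 = 130 dwords (260 with a shadow), versus
64 + 511 * 6 = 3130 before the patch.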
> + r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
> + if (r)
> + return r;
> +
> + params.ib = &job->ibs[0];
> + } else if (!pd_addr) {
> + r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
> + if (r)
> + return r;
> + r = amdgpu_vm_bo_wait(adev, parent->bo);
> + if (unlikely(r)) {
> + amdgpu_bo_kunmap(parent->bo);
> + return r;
> + }
> + }
> +
> pde = pd_addr + pt_idx * 8;
> if (((last_pde + 8 * count) != pde) ||
> ((last_pt + incr * count) != pt) ||
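Context, since the hunk ends mid-condition: this is the pre-existing
run-coalescing logic, where consecutive PDEs whose page tables are
also consecutive get folded into a single params.func() call covering
"count" entries. A simplified sketch of the idea; flush() and
pt_base[] are stand-ins, not the kernel's names, and the real code
also caps the run length:

	uint64_t last_pde = ~0ULL, last_pt = ~0ULL;
	unsigned int count = 0, idx;

	for (idx = 0; idx <= last_entry_used; ++idx) {
		uint64_t pde = pd_addr + idx * 8; /* 8 bytes per PDE */
		uint64_t pt = pt_base[idx];	  /* child PT address */

		if (last_pde + 8 * count != pde ||
		    last_pt + incr * count != pt) {
			if (count)		/* emit finished run */
				flush(last_pde, last_pt, count);
			last_pde = pde;
			last_pt = pt;
			count = 1;
		} else {
			++count;		/* extend current run */
		}
	}
	if (count)				/* emit the tail run */
		flush(last_pde, last_pt, count);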
> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>
> if (params.func == amdgpu_vm_cpu_set_ptes)
> amdgpu_bo_kunmap(parent->bo);
> - else if (params.ib->length_dw == 0) {
> + else if (params.ib && params.ib->length_dw == 0) {
> amdgpu_job_free(job);
> - } else {
> + } else if (params.ib) {
> amdgpu_ring_pad_ib(ring, params.ib);
> amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
> AMDGPU_FENCE_OWNER_VM);
> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>
> amdgpu_bo_fence(parent->bo, fence, true);
> dma_fence_put(vm->last_dir_update);
> - vm->last_dir_update = dma_fence_get(fence);
> - dma_fence_put(fence);
> + vm->last_dir_update = fence;
Here I'd prefer to keep the previous code; the logic is clearer. :)
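For what it's worth, both variants are refcount-correct; they differ
only in whether the reference returned by the submit is handed over
or duplicated and then dropped. Side by side (sketch):

	/* Previous code: take a second reference for last_dir_update,
	 * then drop the local one from the submit. */
	vm->last_dir_update = dma_fence_get(fence);
	dma_fence_put(fence);

	/* Patch: transfer the submit's reference directly; one fewer
	 * atomic get/put pair, but the handoff is implicit. */
	vm->last_dir_update = fence;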
> }
> /*
> * Recurse into the subdirectories. This recursion is harmless because
> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
> for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
> struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>
> - if (!entry->bo)
> + if (!entry->bo || !entry->entries)
In fact, !entry->entries would be checked at the beginning of this
function on the next recursion, but either way is OK with me here.
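If I read this right, the alternative would be for the callee to bail
out up front instead; hypothetically (this check is not part of the
patch):

static int amdgpu_vm_update_level(struct amdgpu_device *adev,
				  struct amdgpu_vm *vm,
				  struct amdgpu_vm_pt *parent,
				  unsigned level)
{
	/* Hypothetical early-out: a leaf page table has no child
	 * entries, so there is no directory level to update here. */
	if (!parent->entries)
		return 0;

	/* ... rest of the function as in the diff above ... */
}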
Regards,
David Zhou
> continue;
>
> r = amdgpu_vm_update_level(adev, vm, entry, level + 1);