[PATCH] drm/amdgpu: Optimize recursion in amdgpu_vm_update_level

Felix Kuehling felix.kuehling at amd.com
Wed Jul 12 19:38:52 UTC 2017


On 17-07-12 03:50 AM, Christian König wrote:
>  
> That optimization is unnecessary.
>
> I have a patch in the pipeline that makes the VM BO permanently CPU
> mapped and also fixes a couple of bugs in those code path.

You mean just for the CPU-update path? I think deferring the SDMA IB
allocation still makes sense. In fact that's the code path I tested.

>
> Give me a moment to clean that up and send it out.

Sure. BTW, I found that the most expensive operation is actually
amdgpu_vm_move_pt_bos_in_lru. It has to recurse through all PT BOs and
does lots of redundant list pointer manipulations on two lists (LRU and
swap), plus kref get/put operations.

For KFD VMs I'm removing the call to it. I think it's useless for us.
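
To give a rough idea of the cost pattern, here is a toy model (made-up
types, not the real amdgpu/ttm code) of what every submission ends up
doing for every single PT BO:

struct toy_list { struct toy_list *prev, *next; };

struct toy_pt {
	struct toy_list lru, swap;	/* one entry on each global list */
	int ref;
	struct toy_pt *entries;		/* sub-tables, NULL for leaves */
	unsigned last_entry_used;
};

static void toy_move_tail(struct toy_list *e, struct toy_list *head)
{
	e->prev->next = e->next;	/* unlink: two pointer writes */
	e->next->prev = e->prev;
	e->prev = head->prev;		/* relink at the tail: four more */
	e->next = head;
	head->prev->next = e;
	head->prev = e;
}

static void toy_move_pt_bos(struct toy_pt *parent,
			    struct toy_list *lru, struct toy_list *swap)
{
	unsigned i;

	if (!parent->entries)
		return;

	for (i = 0; i <= parent->last_entry_used; ++i) {
		struct toy_pt *pt = &parent->entries[i];

		pt->ref++;			/* kref get */
		toy_move_tail(&pt->lru, lru);	/* LRU list */
		toy_move_tail(&pt->swap, swap);	/* swap list */
		pt->ref--;			/* kref put */
		toy_move_pt_bos(pt, lru, swap);
	}
}

All of that is pure overhead when the page tables haven't moved since
the last submission.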

amdgpu_vm_validate_pt_bos is also expensive, but not as bad because it's
only done if there was an eviction.

I was thinking about ways to optimize this. It should be possible to
allocate multiple PTBs and PDBs in a single BO in amdgpu_vm_alloc_pts.
That would potentially save lots of time in vm_move_pt_bos and
vm_validate. What do you think?
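
Roughly what I have in mind, as a sketch only (the names are made up,
this is not the real amdgpu API): back a whole group of page tables
with one allocation and address each table by its offset.

#include <stdint.h>
#include <stdlib.h>

#define TOY_PT_SIZE 4096u		/* one page table worth of PTEs, simplified */

struct toy_pt_block {			/* stands in for one big BO */
	void *cpu_addr;			/* kmapped once for all tables in the block */
	uint64_t gpu_addr;		/* validated / LRU-moved once per block */
	unsigned num_pts;
};

struct toy_pt_slot {			/* what a directory entry would keep */
	struct toy_pt_block *block;
	unsigned index;			/* slot inside the shared block */
};

static struct toy_pt_block *toy_pt_block_alloc(unsigned num_pts)
{
	struct toy_pt_block *blk = malloc(sizeof(*blk));

	if (!blk)
		return NULL;
	blk->cpu_addr = calloc(num_pts, TOY_PT_SIZE);	/* the one shared backing store */
	if (!blk->cpu_addr) {
		free(blk);
		return NULL;
	}
	blk->gpu_addr = 0;		/* would come from validation/binding */
	blk->num_pts = num_pts;
	return blk;
}

/* Address of one page table inside the shared block */
static uint64_t toy_pt_gpu_addr(const struct toy_pt_slot *slot)
{
	return slot->block->gpu_addr + (uint64_t)slot->index * TOY_PT_SIZE;
}

With something like that, vm_move_pt_bos and vm_validate would only
have to touch one object per block instead of one per page table.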

Regards,
  Felix

>
> Regards,
> Christian.
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
>>   1 file changed, 55 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index ff5de3a..23b899b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>   {
>>       struct amdgpu_bo *shadow;
>>       struct amdgpu_ring *ring = NULL;
>> -    uint64_t pd_addr, shadow_addr = 0;
>> +    uint64_t pd_addr = 0, shadow_addr = 0;
>>       uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>       uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>       unsigned count = 0, pt_idx, ndw = 0;
>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>         WARN_ON(vm->use_cpu_for_update && shadow);
>>       if (vm->use_cpu_for_update && !shadow) {
>> -        r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> -        if (r)
>> -            return r;
>> -        r = amdgpu_vm_bo_wait(adev, parent->bo);
>> -        if (unlikely(r)) {
>> -            amdgpu_bo_kunmap(parent->bo);
>> -            return r;
>> -        }
>> +        /* Defer kmapping until it's actually needed. Some
>> +         * PDBs may need no update at all
>> +         */
>>           params.func = amdgpu_vm_cpu_set_ptes;
>> +        params.ib = (void *)(long)-1;
>>       } else {
>> -        if (shadow) {
>> -            r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>> -            if (r)
>> -                return r;
>> -        }
>> -        ring = container_of(vm->entity.sched, struct amdgpu_ring,
>> -                    sched);
>> -
>> -        /* padding, etc. */
>> -        ndw = 64;
>> -
>> -        /* assume the worst case */
>> -        ndw += parent->last_entry_used * 6;
>> -
>> -        pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>> -
>> -        if (shadow) {
>> -            shadow_addr = amdgpu_bo_gpu_offset(shadow);
>> -            ndw *= 2;
>> -        } else {
>> -            shadow_addr = 0;
>> -        }
>> -
>> -        r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> -        if (r)
>> -            return r;
>> -
>> -        params.ib = &job->ibs[0];
>> +        /* Defer IB allocation until it's actually
>> +         * needed. Some PDBs may need no update at all
>> +         */
>> +        params.ib = NULL;
>>           params.func = amdgpu_vm_do_set_ptes;
>>       }
>>   -
>>       /* walk over the address space and update the directory */
>>       for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>           struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>           if (bo == NULL)
>>               continue;
>>   -        if (bo->shadow) {
>> -            struct amdgpu_bo *pt_shadow = bo->shadow;
>> -
>> -            r = amdgpu_ttm_bind(&pt_shadow->tbo,
>> -                        &pt_shadow->tbo.mem);
>> -            if (r)
>> -                return r;
>> -        }
>> -
>> -        pt = amdgpu_bo_gpu_offset(bo);
>> -        pt = amdgpu_gart_get_vm_pde(adev, pt);
>> +        pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>>           if (parent->entries[pt_idx].addr == pt)
>>               continue;
>>             parent->entries[pt_idx].addr = pt;
>>   +        if (!params.ib) {
>> +            if (shadow) {
>> +                r = amdgpu_ttm_bind(&shadow->tbo,
>> +                            &shadow->tbo.mem);
>> +                if (r)
>> +                    return r;
>> +            }
>> +
>> +            ring = container_of(vm->entity.sched,
>> +                        struct amdgpu_ring, sched);
>> +
>> +            /* padding, etc. */
>> +            ndw = 64;
>> +
>> +            /* assume the worst case */
>> +            ndw += (parent->last_entry_used - pt_idx) * 6;
>> +
>> +            pd_addr = parent->bo->tbo.offset;
>> +
>> +            if (shadow) {
>> +                shadow_addr = shadow->tbo.offset;
>> +                ndw *= 2;
>> +            } else {
>> +                shadow_addr = 0;
>> +            }
>> +            r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> +            if (r)
>> +                return r;
>> +
>> +            params.ib = &job->ibs[0];
>> +        } else if (!pd_addr) {
>> +            r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> +            if (r)
>> +                return r;
>> +            r = amdgpu_vm_bo_wait(adev, parent->bo);
>> +            if (unlikely(r)) {
>> +                amdgpu_bo_kunmap(parent->bo);
>> +                return r;
>> +            }
>> +        }
>> +
>>           pde = pd_addr + pt_idx * 8;
>>           if (((last_pde + 8 * count) != pde) ||
>>               ((last_pt + incr * count) != pt) ||
>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>         if (params.func == amdgpu_vm_cpu_set_ptes)
>>           amdgpu_bo_kunmap(parent->bo);
>> -    else if (params.ib->length_dw == 0) {
>> +    else if (params.ib && params.ib->length_dw == 0) {
>>           amdgpu_job_free(job);
>> -    } else {
>> +    } else if (params.ib) {
>>           amdgpu_ring_pad_ib(ring, params.ib);
>>           amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>                    AMDGPU_FENCE_OWNER_VM);
>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>             amdgpu_bo_fence(parent->bo, fence, true);
>>           dma_fence_put(vm->last_dir_update);
>> -        vm->last_dir_update = dma_fence_get(fence);
>> -        dma_fence_put(fence);
>> +        vm->last_dir_update = fence;
>>       }
>>       /*
>>        * Recurse into the subdirectories. This recursion is harmless because
>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>       for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>           struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>>   -        if (!entry->bo)
>> +        if (!entry->bo || !entry->entries)
>>               continue;
>>             r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>
>


