[PATCH] drm/amdgpu: Optimize recursion in amdgpu_vm_update_level
Felix Kuehling
felix.kuehling at amd.com
Wed Jul 12 19:38:52 UTC 2017
On 17-07-12 03:50 AM, Christian König wrote:
>
> That optimization is unnecessary.
>
> I have a patch in the pipeline that makes the VM BO permanently CPU
> mapped and also fixes a couple of bugs in those code paths.
You mean just for the CPU-update path? I think deferring the SDMA IB
allocation still makes sense. In fact that's the code path I tested.
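
To illustrate the deferred-allocation idea outside the kernel, here is a
minimal standalone sketch (plain userspace C, all names made up, not the
actual amdgpu code): the command buffer ("IB") is only allocated once the
first directory entry actually needs updating, so a level that is already
up to date allocates nothing.

    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for an indirect buffer: just an array of "commands". */
    struct ib {
        unsigned int len;
        unsigned long cmds[64];
    };

    /*
     * Walk one directory level and record an update for every entry whose
     * current value differs from the wanted one. The IB is allocated
     * lazily, on the first entry that actually needs it.
     */
    static int update_level(const unsigned long *cur,
                            const unsigned long *want,
                            unsigned int n, struct ib **out)
    {
        struct ib *ib = NULL;
        unsigned int i;

        for (i = 0; i < n && i < 64; i++) {
            if (cur[i] == want[i])
                continue;                  /* entry already correct, skip */

            if (!ib) {                     /* deferred: first dirty entry */
                ib = calloc(1, sizeof(*ib));
                if (!ib)
                    return -1;
            }
            ib->cmds[ib->len++] = want[i]; /* record the new entry value */
        }
        *out = ib;                         /* NULL means nothing to submit */
        return 0;
    }

    int main(void)
    {
        unsigned long cur[4]  = { 1, 2, 3, 4 };
        unsigned long want[4] = { 1, 2, 9, 4 };
        struct ib *ib = NULL;

        if (!update_level(cur, want, 4, &ib) && ib) {
            printf("submitted %u command(s)\n", ib->len);
            free(ib);
        } else {
            printf("nothing to submit\n");
        }
        return 0;
    }

The patch quoted below applies the same pattern to amdgpu_vm_update_level:
params.ib stays NULL (and the page directory stays unmapped on the CPU
path) until the first PDE that actually changed.
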
>
> Give me a moment to clean that up and send it out.
Sure. BTW, I found that the most expensive operation turned out to be
amdgpu_vm_move_pt_bos_in_lru. It has to recurse through all PT BOs and
does lots of redundant list pointer manipulation on two lists (LRU and
swap), plus a kref get/put for each BO.
For KFD VMs I'm removing the call to it. I think it's useless for us.
amdgpu_vm_validate_pt_bos is also expensive, but not as bad because it's
only done if there was an eviction.
I was thinking about ways to optimize this. It should be possible to
allocate multiple PTBs and PDBs in a single BO in amdgpu_vm_alloc_pts.
That would potentially save lots of time in vm_move_pt_bos and
vm_validate. What do you think?
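
To make the intent concrete, a rough standalone sketch (userspace C,
hypothetical names and sizes, not a proposal for the actual
amdgpu_vm_alloc_pts code): one backing allocation is carved into
page-table-sized chunks, so LRU and validate bookkeeping would happen
once per block instead of once per page table.

    #include <stdio.h>
    #include <stdlib.h>

    #define PT_SIZE    4096u  /* bytes per page table (illustrative) */
    #define PTS_PER_BO   64u  /* tables packed into one backing allocation */

    /* One large allocation standing in for the shared BO. */
    struct pt_block {
        unsigned char *mem;
        unsigned int used;    /* page tables handed out so far */
    };

    /*
     * Hand out the next page-table-sized chunk, allocating the backing
     * block on first use. A real version would chain blocks once one
     * fills up; here a full block simply returns NULL.
     */
    static void *alloc_pt(struct pt_block *blk)
    {
        if (!blk->mem) {
            blk->mem = calloc(PTS_PER_BO, PT_SIZE);
            if (!blk->mem)
                return NULL;
        }
        if (blk->used == PTS_PER_BO)
            return NULL;
        return blk->mem + (size_t)PT_SIZE * blk->used++;
    }

    int main(void)
    {
        struct pt_block blk = { 0 };
        void *pt0 = alloc_pt(&blk);
        void *pt1 = alloc_pt(&blk);

        printf("pt0=%p pt1=%p share one backing allocation\n", pt0, pt1);
        free(blk.mem);
        return 0;
    }
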
Regards,
Felix
>
> Regards,
> Christian.
>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
>> 1 file changed, 55 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index ff5de3a..23b899b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>> {
>> struct amdgpu_bo *shadow;
>> struct amdgpu_ring *ring = NULL;
>> - uint64_t pd_addr, shadow_addr = 0;
>> + uint64_t pd_addr = 0, shadow_addr = 0;
>> uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>> uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>> unsigned count = 0, pt_idx, ndw = 0;
>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>> WARN_ON(vm->use_cpu_for_update && shadow);
>> if (vm->use_cpu_for_update && !shadow) {
>> - r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> - if (r)
>> - return r;
>> - r = amdgpu_vm_bo_wait(adev, parent->bo);
>> - if (unlikely(r)) {
>> - amdgpu_bo_kunmap(parent->bo);
>> - return r;
>> - }
>> + /* Defer kmapping until it's actually needed. Some
>> + * PDBs may need no update at all
>> + */
>> params.func = amdgpu_vm_cpu_set_ptes;
>> + params.ib = (void *)(long)-1;
>> } else {
>> - if (shadow) {
>> - r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>> - if (r)
>> - return r;
>> - }
>> - ring = container_of(vm->entity.sched, struct amdgpu_ring,
>> - sched);
>> -
>> - /* padding, etc. */
>> - ndw = 64;
>> -
>> - /* assume the worst case */
>> - ndw += parent->last_entry_used * 6;
>> -
>> - pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>> -
>> - if (shadow) {
>> - shadow_addr = amdgpu_bo_gpu_offset(shadow);
>> - ndw *= 2;
>> - } else {
>> - shadow_addr = 0;
>> - }
>> -
>> - r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> - if (r)
>> - return r;
>> -
>> - params.ib = &job->ibs[0];
>> + /* Defer IB allocation until it's actually
>> + * needed. Some PDBs may need no update at all
>> + */
>> + params.ib = NULL;
>> params.func = amdgpu_vm_do_set_ptes;
>> }
>> -
>> /* walk over the address space and update the directory */
>> for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>> struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>> if (bo == NULL)
>> continue;
>> - if (bo->shadow) {
>> - struct amdgpu_bo *pt_shadow = bo->shadow;
>> -
>> - r = amdgpu_ttm_bind(&pt_shadow->tbo,
>> - &pt_shadow->tbo.mem);
>> - if (r)
>> - return r;
>> - }
>> -
>> - pt = amdgpu_bo_gpu_offset(bo);
>> - pt = amdgpu_gart_get_vm_pde(adev, pt);
>> + pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>> if (parent->entries[pt_idx].addr == pt)
>> continue;
>> parent->entries[pt_idx].addr = pt;
>> + if (!params.ib) {
>> + if (shadow) {
>> + r = amdgpu_ttm_bind(&shadow->tbo,
>> + &shadow->tbo.mem);
>> + if (r)
>> + return r;
>> + }
>> +
>> + ring = container_of(vm->entity.sched,
>> + struct amdgpu_ring, sched);
>> +
>> + /* padding, etc. */
>> + ndw = 64;
>> +
>> + /* assume the worst case */
>> + ndw += (parent->last_entry_used - pt_idx) * 6;
>> +
>> + pd_addr = parent->bo->tbo.offset;
>> +
>> + if (shadow) {
>> + shadow_addr = shadow->tbo.offset;
>> + ndw *= 2;
>> + } else {
>> + shadow_addr = 0;
>> + }
>> + r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> + if (r)
>> + return r;
>> +
>> + params.ib = &job->ibs[0];
>> + } else if (!pd_addr) {
>> + r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> + if (r)
>> + return r;
>> + r = amdgpu_vm_bo_wait(adev, parent->bo);
>> + if (unlikely(r)) {
>> + amdgpu_bo_kunmap(parent->bo);
>> + return r;
>> + }
>> + }
>> +
>> pde = pd_addr + pt_idx * 8;
>> if (((last_pde + 8 * count) != pde) ||
>> ((last_pt + incr * count) != pt) ||
>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>> if (params.func == amdgpu_vm_cpu_set_ptes)
>> amdgpu_bo_kunmap(parent->bo);
>> - else if (params.ib->length_dw == 0) {
>> + else if (params.ib && params.ib->length_dw == 0) {
>> amdgpu_job_free(job);
>> - } else {
>> + } else if (params.ib) {
>> amdgpu_ring_pad_ib(ring, params.ib);
>> amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>> AMDGPU_FENCE_OWNER_VM);
>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>> amdgpu_bo_fence(parent->bo, fence, true);
>> dma_fence_put(vm->last_dir_update);
>> - vm->last_dir_update = dma_fence_get(fence);
>> - dma_fence_put(fence);
>> + vm->last_dir_update = fence;
>> }
>> /*
>> * Recurse into the subdirectories. This recursion is harmless because
>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>> for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>> struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>> - if (!entry->bo)
>> + if (!entry->bo || !entry->entries)
>> continue;
>> r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>
>