[PATCH] drm/amdgpu: Optimize recursion in amdgpu_vm_update_level
zhoucm1
david1.zhou at amd.com
Wed Jul 12 05:52:01 UTC 2017
On 2017-07-12 12:56, Felix Kuehling wrote:
> Hi David,
>
> Responses inline ...
Yeah, I see what you mean; you mainly want to save some CPU overhead. If
nobody else objects, feel free to add my RB then.
Regards,
David Zhou
>
> On 17-07-12 12:30 AM, zhoucm1 wrote:
>> Nice improvement, just some nitpicks inline; otherwise the patch is
>> Reviewed-by: Chunming Zhou <david1.zhou at amd.com>.
>>
>> On 2017-07-12 11:56, Felix Kuehling wrote:
>>> When lots of virtual address spaces are used, there can be thousands
>>> of page table BOs. amdgpu_vm_update_level iterates over all of them
>>> recursively. In many cases only a few or none at all need to be
>>> updated. Minimize unnecessary code execution and memory usage in
>>> those cases.
>>>
>>> This speeds up memory mapping in a synthetic KFD memory mapping
>>> benchmark by roughly a factor of two.
>>>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
>>> 1 file changed, 55 insertions(+), 54 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index ff5de3a..23b899b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>> {
>>> struct amdgpu_bo *shadow;
>>> struct amdgpu_ring *ring = NULL;
>>> - uint64_t pd_addr, shadow_addr = 0;
>>> + uint64_t pd_addr = 0, shadow_addr = 0;
>>> uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>> uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>> unsigned count = 0, pt_idx, ndw = 0;
>>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>> WARN_ON(vm->use_cpu_for_update && shadow);
>>> if (vm->use_cpu_for_update && !shadow) {
>>> - r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> - if (r)
>>> - return r;
>>> - r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> - if (unlikely(r)) {
>>> - amdgpu_bo_kunmap(parent->bo);
>>> - return r;
>>> - }
>>> + /* Defer kmapping until it's actually needed. Some
>>> + * PDBs may need no update at all
>>> + */
>>> params.func = amdgpu_vm_cpu_set_ptes;
>>> + params.ib = (void *)(long)-1;
>>> } else {
>>> - if (shadow) {
>>> - r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>>> - if (r)
>>> - return r;
>>> - }
>>> - ring = container_of(vm->entity.sched, struct amdgpu_ring,
>>> - sched);
>>> -
>>> - /* padding, etc. */
>>> - ndw = 64;
>>> -
>>> - /* assume the worst case */
>>> - ndw += parent->last_entry_used * 6;
>>> -
>>> - pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>>> -
>>> - if (shadow) {
>>> - shadow_addr = amdgpu_bo_gpu_offset(shadow);
>>> - ndw *= 2;
>>> - } else {
>>> - shadow_addr = 0;
>>> - }
>>> -
>>> - r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> - if (r)
>>> - return r;
>>> -
>>> - params.ib = &job->ibs[0];
>>> + /* Defer IB allocation until it's actually
>>> + * needed. Some PDBs may need no update at all
>>> + */
>>> + params.ib = NULL;
>>> params.func = amdgpu_vm_do_set_ptes;
>>> }
>>> -
>>> /* walk over the address space and update the directory */
>>> for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>> struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>> if (bo == NULL)
>>> continue;
>>> - if (bo->shadow) {
>>> - struct amdgpu_bo *pt_shadow = bo->shadow;
>>> -
>>> - r = amdgpu_ttm_bind(&pt_shadow->tbo,
>>> - &pt_shadow->tbo.mem);
>>> - if (r)
>>> - return r;
>>> - }
>>> -
>>> - pt = amdgpu_bo_gpu_offset(bo);
>>> - pt = amdgpu_gart_get_vm_pde(adev, pt);
>>> + pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>> I think getting the offset from amdgpu_bo_gpu_offset(bo) is better,
>> although we can assume the pt BO is in a known domain.
> amdgpu_bo_gpu_offset does a bunch of sanity checks (WARN_ON_ONCE). They
> are redundant here, because we can assume that the page table BOs have
> been properly created and validated by kernel code we can trust more
> than user mode. I saw amdgpu_bo_gpu_offset near the top with "perf top"
> when running my memory mapping benchmark, so I think it makes sense to
> eliminate the function call and redundant sanity checks in this code path.
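>
> For reference, a minimal sketch of what the helper does (from memory,
> not the exact upstream source):
>
>     static inline u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo)
>     {
>         /* These sanity checks are what shows up in the profile: */
>         WARN_ON_ONCE(bo->tbo.mem.mem_type == TTM_PL_SYSTEM);
>         WARN_ON_ONCE(!ww_mutex_is_locked(&bo->tbo.resv->lock) &&
>                      !bo->pin_count);
>         return bo->tbo.offset;
>     }
>
> In the directory update loop the page table BOs are known to have been
> created, validated and bound by the kernel, so reading bo->tbo.offset
> directly is safe and skips both the call and the checks.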
>
>>> if (parent->entries[pt_idx].addr == pt)
>>> continue;
>>> parent->entries[pt_idx].addr = pt;
>>> + if (!params.ib) {
>>> + if (shadow) {
>>> + r = amdgpu_ttm_bind(&shadow->tbo,
>>> + &shadow->tbo.mem);
>>> + if (r)
>>> + return r;
>>> + }
>>> +
>>> + ring = container_of(vm->entity.sched,
>>> + struct amdgpu_ring, sched);
>>> +
>>> + /* padding, etc. */
>>> + ndw = 64;
>>> +
>>> + /* assume the worst case */
>>> + ndw += (parent->last_entry_used - pt_idx) * 6;
>>> +
>>> + pd_addr = parent->bo->tbo.offset;
>>> +
>>> + if (shadow) {
>>> + shadow_addr = shadow->tbo.offset;
>>> + ndw *= 2;
>>> + } else {
>>> + shadow_addr = 0;
>>> + }
>>> + r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> + if (r)
>>> + return r;
>>> +
>>> + params.ib = &job->ibs[0];
>>> + } else if (!pd_addr) {
>>> + r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> + if (r)
>>> + return r;
>>> + r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> + if (unlikely(r)) {
>>> + amdgpu_bo_kunmap(parent->bo);
>>> + return r;
>>> + }
>>> + }
>>> +
>>> pde = pd_addr + pt_idx * 8;
>>> if (((last_pde + 8 * count) != pde) ||
>>> ((last_pt + incr * count) != pt) ||
>>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>> if (params.func == amdgpu_vm_cpu_set_ptes)
>>> amdgpu_bo_kunmap(parent->bo);
>>> - else if (params.ib->length_dw == 0) {
>>> + else if (params.ib && params.ib->length_dw == 0) {
>>> amdgpu_job_free(job);
>>> - } else {
>>> + } else if (params.ib) {
>>> amdgpu_ring_pad_ib(ring, params.ib);
>>> amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>> AMDGPU_FENCE_OWNER_VM);
>>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>> amdgpu_bo_fence(parent->bo, fence, true);
>>> dma_fence_put(vm->last_dir_update);
>>> - vm->last_dir_update = dma_fence_get(fence);
>>> - dma_fence_put(fence);
>>> + vm->last_dir_update = fence;
>> Here I'd prefer to keep the previous code; the logic is clearer. :)
> Again, this is a code path that can be called very frequently. So I'd
> like to eliminate redundant atomic operations here.
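>
> I.e. the old code took an extra reference only to drop the caller's:
>
>     /* before: two atomic ops (get + put) per directory update */
>     vm->last_dir_update = dma_fence_get(fence);
>     dma_fence_put(fence);
>
>     /* after: hand the reference we already hold straight to the VM */
>     vm->last_dir_update = fence;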
>
>>> }
>>> /*
>>> * Recurse into the subdirectories. This recursion is harmless
>>> because
>>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>> for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>> struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>>> - if (!entry->bo)
>>> + if (!entry->bo || !entry->entries)
>> In fact, !entry->entries would be checked at the beginning of this
>> function in the next recursion level, but either way is OK with me here.
> Doing the check here eliminates the lowest level of recursive function
> calls. That's where it counts the most. Each successive level of the
> recursion runs 512 times as often as the level above it. So completely
> eliminating the function call overhead of the lowest level is a good thing.
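>
> Roughly, the pruned recursion looks like this (error handling is
> elided in the quoted hunk, so that part is my assumption):
>
>     for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>         struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>
>         /* Skip holes, and skip leaf page tables that have no
>          * sub-entries: the leaves outnumber their parents 512:1,
>          * so this saves one function call per leaf.
>          */
>         if (!entry->bo || !entry->entries)
>             continue;
>
>         r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>         if (r)
>             return r;
>     }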
>
> Regards,
> Felix
>
>> Regards,
>> David Zhou
>>> continue;
>>> r = amdgpu_vm_update_level(adev, vm, entry, level + 1);