[PATCH 2/2] drm/amdkfd: optimize memory mapping latency
Felix Kuehling
felix.kuehling at amd.com
Tue Jun 1 16:05:21 UTC 2021
Am 2021-05-29 um 6:51 p.m. schrieb Eric Huang:
> 1. conditionally flush TLBs after map.
> 2. add heavy weight TLBs flushing after unmap.
>
> Signed-off-by: Eric Huang <jinhuieric.huang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 +
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 24 +++++++++++--------
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 6 ++---
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++--
> 8 files changed, 25 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 2560977760b3..997258c24ef2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -85,6 +85,7 @@ struct kgd_mem {
>
> bool aql_queue;
> bool is_imported;
> + bool table_freed;
> };
>
> /* KFD Memory Eviction */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 385c33675227..8ac0d849fd3f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1132,6 +1132,8 @@ static int update_gpuvm_pte(struct kgd_mem *mem,
> return ret;
> }
>
> + mem->table_freed = bo_va->table_freed;
> +
I think this should be
mem->table_freed = mem->table_freed || bo_va->table_freed;
That way, on a multi-GPU system, mem->table_freed gets set to true if
any GPU freed a page table. Then somewhere, this needs to be reset to false.
However, that means, if one GPU frees a page table, all GPUs need to
flush, which may be unnecessary. A better alternative would be to do the
TLB flushing right here, only for the affected GPU, instead of returning
an aggregated "table_freed" all the way back to
kfd_ioctl_map_memory_to_gpu, which flushes all GPUs.
Finally, bo_va->table_freed is only used once, after the
amdgpu_vm_bo_update call returns. So there is no good reason to store
this permanently in the bo_va structure. It would be better to just add
an output parameter to amdgpu_vm_bo_update.
Regards,
Felix
> return amdgpu_sync_fence(sync, bo_va->last_pt_update);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 36e7f088d4ee..0e0f27f779cd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -87,6 +87,7 @@ struct amdgpu_bo_va {
> bool cleared;
>
> bool is_xgmi;
> + bool table_freed;
> };
>
> struct amdgpu_bo {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 95b94c95adac..ff3eb8395017 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1885,7 +1885,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
> resv, mapping->start,
> mapping->last, update_flags,
> mapping->offset, mem,
> - pages_addr, last_update, NULL,
> + pages_addr, last_update, &bo_va->table_freed,
> vram_base_offset);
> if (r)
> return r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 960913a35ee4..c45ccd1d03c0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1658,16 +1658,18 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
> }
>
> /* Flush TLBs after waiting for the page table updates to complete */
> - for (i = 0; i < args->n_devices; i++) {
> - peer = kfd_device_by_id(devices_arr[i]);
> - if (WARN_ON_ONCE(!peer))
> - continue;
> - peer_pdd = kfd_get_process_device_data(peer, p);
> - if (WARN_ON_ONCE(!peer_pdd))
> - continue;
> - if (!amdgpu_read_lock(peer->ddev, true)) {
> - kfd_flush_tlb(peer_pdd);
> - amdgpu_read_unlock(peer->ddev);
> + if (((struct kgd_mem *)mem)->table_freed) {
> + for (i = 0; i < args->n_devices; i++) {
> + peer = kfd_device_by_id(devices_arr[i]);
> + if (WARN_ON_ONCE(!peer))
> + continue;
> + peer_pdd = kfd_get_process_device_data(peer, p);
> + if (WARN_ON_ONCE(!peer_pdd))
> + continue;
> + if (!amdgpu_read_lock(peer->ddev, true)) {
> + kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY);
> + amdgpu_read_unlock(peer->ddev);
> + }
> }
> }
>
> @@ -1766,6 +1768,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
> amdgpu_read_unlock(peer->ddev);
> goto unmap_memory_from_gpu_failed;
> }
> + ((struct kgd_mem *)mem)->table_freed = false;
> + kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
> amdgpu_read_unlock(peer->ddev);
> args->n_success = i+1;
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c1bea1f7627b..a4920bc5cfbc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -278,7 +278,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
> qpd->vmid,
> qpd->page_table_base);
> /* invalidate the VM context after pasid and vmid mapping is set up */
> - kfd_flush_tlb(qpd_to_pdd(qpd));
> + kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY);
>
> if (dqm->dev->kfd2kgd->set_scratch_backing_va)
> dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->kgd,
> @@ -314,7 +314,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
> if (flush_texture_cache_nocpsch(q->device, qpd))
> pr_err("Failed to flush TC\n");
>
> - kfd_flush_tlb(qpd_to_pdd(qpd));
> + kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY);
>
> /* Release the vmid mapping */
> set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
> @@ -885,7 +885,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
> dqm->dev->kgd,
> qpd->vmid,
> qpd->page_table_base);
> - kfd_flush_tlb(pdd);
> + kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
> }
>
> /* Take a safe reference to the mm_struct, which may otherwise
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ecdd5e782b81..edce3ecf207d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1338,7 +1338,7 @@ void kfd_signal_reset_event(struct kfd_dev *dev);
>
> void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid);
>
> -void kfd_flush_tlb(struct kfd_process_device *pdd);
> +void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type);
>
> int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 4ab9da288f90..a03373743a3d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -2161,7 +2161,7 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
> KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
> }
>
> -void kfd_flush_tlb(struct kfd_process_device *pdd)
> +void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
> {
> struct kfd_dev *dev = pdd->dev;
>
> @@ -2174,7 +2174,7 @@ void kfd_flush_tlb(struct kfd_process_device *pdd)
> pdd->qpd.vmid);
> } else {
> amdgpu_amdkfd_flush_gpu_tlb_pasid(dev->kgd,
> - pdd->process->pasid, TLB_FLUSH_LEGACY);
> + pdd->process->pasid, type);
> }
> }
>
More information about the amd-gfx
mailing list