<div class="moz-cite-prefix">On 2023-09-11 21:11, Felix Kuehling
wrote:<br>
</div>
<blockquote type="cite" cite="mid:b8b95ee3-e608-a996-db65-cb02c418620a@amd.com">On
2023-09-11 15:55, Philip Yang wrote:
<br>
<blockquote type="cite">Otherwise GPU may access the stale mapping
and generate IOMMU
<br>
IO_PAGE_FAULT.
<br>
<br>
Move this to inside p->mutex to prevent multiple threads
mapping and
<br>
unmapping concurrently race condition.
<br>
<br>
After kfd_mem_dmaunmap_attachment is removed from
unmap_bo_from_gpuvm,
<br>
kfd_mem_dmaunmap_attachment is called if failed to map to GPUs,
and
<br>
before free the mem attachment in case failed to unmap from
GPUs.
<br>
<br>
Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
<br>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 22 +++++++++++++++---
>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 23 ++++++++++++-------
>>  3 files changed, 35 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 559f14cc0a99..dff79a623f4a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -304,6 +304,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev,
>>                                            struct kgd_mem *mem, void *drm_priv);
>>  int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
>>                 struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv);
>> +void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct amdgpu_vm *vm, struct kgd_mem *mem);
>
> For consistency with the other amdgpu_amdkfd APIs, please replace the vm
> parameter with a drm_priv parameter and do the drm_priv_to_vm translation
> inside amdgpu_amdkfd_gpuvm_dmaunmap_mem.

ok.
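For v2, something along these lines (rough, untested sketch; assumes
drm_priv_to_vm can be used here the same way the other
amdgpu_amdkfd_gpuvm_* entry points use it):

    /* Take drm_priv like the other kfd<->amdgpu APIs and translate to
     * the VM inside the helper.
     */
    void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
    {
            struct kfd_mem_attachment *entry;
            struct amdgpu_vm *vm = drm_priv_to_vm(drm_priv);

            mutex_lock(&mem->lock);

            list_for_each_entry(entry, &mem->attachments, list) {
                    if (entry->bo_va->base.vm != vm)
                            continue;
                    kfd_mem_dmaunmap_attachment(mem, entry);
            }

            mutex_unlock(&mem->lock);
    }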
<blockquote type="cite" cite="mid:b8b95ee3-e608-a996-db65-cb02c418620a@amd.com">
<br>
<br>
<blockquote type="cite"> int amdgpu_amdkfd_gpuvm_sync_memory(
<br>
struct amdgpu_device *adev, struct kgd_mem *mem, bool
intr);
<br>
int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem
*mem,
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
<br>
index b5b940485059..ae767ad7afa2 100644
<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
<br>
>> @@ -1249,8 +1249,6 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
>>         amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
>>         amdgpu_sync_fence(sync, bo_va->last_pt_update);
>> -
>> -       kfd_mem_dmaunmap_attachment(mem, entry);
>>  }
>>  static int update_gpuvm_pte(struct kgd_mem *mem,
>>
>> @@ -1305,6 +1303,7 @@ static int map_bo_to_gpuvm(struct kgd_mem *mem,
>>  update_gpuvm_pte_failed:
>>         unmap_bo_from_gpuvm(mem, entry, sync);
>> +       kfd_mem_dmaunmap_attachment(mem, entry);
>>         return ret;
>>  }
>>
>> @@ -1910,8 +1909,10 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>>                 mem->va + bo_size * (1 + mem->aql_queue));
>>         /* Remove from VM internal data structures */
>> -       list_for_each_entry_safe(entry, tmp, &mem->attachments, list)
>> +       list_for_each_entry_safe(entry, tmp, &mem->attachments, list) {
>> +               kfd_mem_dmaunmap_attachment(mem, entry);
>
> In theory this should not be needed, because user mode is required to
> unmap all mappings before freeing the BO. If (mapped_to_gpu_memory > 0),
> this function returns -EBUSY.

If amdgpu_amdkfd_gpuvm_sync_memory returns -ERESTARTSYS, or if
vm_validate_pt_pd_bos fails, then amdgpu_amdkfd_gpuvm_dmaunmap_mem is not
called, so this is needed to avoid leaking the DMA mapping.
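Roughly the path I am worried about (simplified sketch, not the literal
ioctl body; the real code is in the kfd_chardev.c hunk quoted further
down):

    err = amdgpu_amdkfd_gpuvm_sync_memory(adev, mem, true);
    if (err)                         /* e.g. -ERESTARTSYS */
            goto sync_memory_failed; /* the dmaunmap loop below is skipped */

    for (i = 0; i < args->n_devices; i++)
            amdgpu_amdkfd_gpuvm_dmaunmap_mem(vm, mem); /* never reached, so the
                                                        * DMA mapping stays until
                                                        * the BO is freed */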
<blockquote type="cite" cite="mid:b8b95ee3-e608-a996-db65-cb02c418620a@amd.com">(mapped_to_gpu_memory
> 0), this function returns -EBUSY. For SG BOs, you may run
into this error message:
<br>
<br>
pr_err("SG Table of BO is UNEXPECTEDLY NULL");
<br>
<br>
If you decide to leave this dmaunmap_attachment here, you may want
to turn that error into a pr_debug and remove "UNEXPECTEDLY".
<br>
</blockquote>
Thanks, I missed this, will change it in next version.
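Planning on something like this (sketch only, assuming that message lives
in kfd_mem_dmaunmap_userptr):

    -		pr_err("SG Table of BO is UNEXPECTEDLY NULL");
    +		pr_debug("SG Table of BO is NULL");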
Regards,
Philip

> Regards,
> Felix
>
<blockquote type="cite"> kfd_mem_detach(entry);
<br>
+ }
<br>
ret = unreserve_bo_and_vms(&ctx, false, false);
<br>
@@ -2085,6 +2086,21 @@ int
amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
<br>
return ret;
<br>
}
<br>
+void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct amdgpu_vm *vm,
struct kgd_mem *mem)
<br>
+{
<br>
+ struct kfd_mem_attachment *entry;
<br>
+
<br>
+ mutex_lock(&mem->lock);
<br>
+
<br>
+ list_for_each_entry(entry, &mem->attachments, list)
{
<br>
+ if (entry->bo_va->base.vm != vm)
<br>
+ continue;
<br>
+ kfd_mem_dmaunmap_attachment(mem, entry);
<br>
+ }
<br>
+
<br>
+ mutex_unlock(&mem->lock);
<br>
+}
<br>
+
<br>
int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
<br>
struct amdgpu_device *adev, struct kgd_mem *mem, void
*drm_priv)
<br>
{
<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
<br>
index 08687ce0aa8b..645628ff1faf 100644
<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
<br>
@@ -1432,17 +1432,24 @@ static int
kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
<br>
goto sync_memory_failed;
<br>
}
<br>
}
<br>
- mutex_unlock(&p->mutex);
<br>
- if (flush_tlb) {
<br>
- /* Flush TLBs after waiting for the page table updates
to complete */
<br>
- for (i = 0; i < args->n_devices; i++) {
<br>
- peer_pdd = kfd_process_device_data_by_id(p,
devices_arr[i]);
<br>
- if (WARN_ON_ONCE(!peer_pdd))
<br>
- continue;
<br>
+ /* Flush TLBs after waiting for the page table updates to
complete */
<br>
+ for (i = 0; i < args->n_devices; i++) {
<br>
+ struct amdgpu_vm *vm;
<br>
+
<br>
+ peer_pdd = kfd_process_device_data_by_id(p,
devices_arr[i]);
<br>
+ if (WARN_ON_ONCE(!peer_pdd))
<br>
+ continue;
<br>
+ if (flush_tlb)
<br>
kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
<br>
- }
<br>
+
<br>
+ /* Remove dma mapping after tlb flush to avoid
IO_PAGE_FAULT */
<br>
+ vm = drm_priv_to_vm(peer_pdd->drm_priv);
<br>
+ amdgpu_amdkfd_gpuvm_dmaunmap_mem(vm, mem);
<br>
}
<br>
+
<br>
+ mutex_unlock(&p->mutex);
<br>
+
<br>
kfree(devices_arr);
<br>
return 0;
<br>