[PATCH 2/2] amd/amdkfd: Trigger segfault for early userptr unmapping
Shane Xiao
shane.xiao at amd.com
Thu Apr 24 05:54:26 UTC 2025
If an application unmaps memory before destroying the userptr that refers
to it, trigger a segfault in VM debug mode to notify user space so that it
can correct its free sequence.
v2: Send GPU access fault to user space
Signed-off-by: Shane Xiao <shane.xiao at amd.com>
---
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14 +++++++++++++
drivers/gpu/drm/amd/amdkfd/kfd_events.c | 20 +++++++++++++++++++
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +++
3 files changed, 37 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index d2ec4130a316..876e9df34adf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2496,6 +2496,7 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
struct ttm_operation_ctx ctx = { false, false };
uint32_t invalid;
int ret = 0;
+ uint64_t userptr = 0;
mutex_lock(&process_info->notifier_lock);
@@ -2559,6 +2560,19 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
if (ret != -EFAULT)
return ret;
+			/* If applications unmap memory before destroying the userptr
+			 * from the KFD, trigger a segmentation fault in VM debug mode.
+			 */
+			if (amdgpu_ttm_adev(bo->tbo.bdev)->debug_vm_userptr) {
+				struct kfd_process *p;
+
+				/* The helper writes the address through its second
+				 * argument, so pass the address of the local.
+				 */
+				amdgpu_ttm_tt_get_userptr(&bo->tbo, &userptr);
+				pr_err("User space unmap memory before destroying a userptr that refers to it\n");
+				pr_err("The unmap userptr address is 0x%llx\n", userptr);
+
+				/* Send GPU VM fault to user space. The lookup returns a
+				 * referenced process, or NULL if the process is already
+				 * gone, so check the result and drop the reference.
+				 */
+				p = kfd_lookup_process_by_pid(process_info->pid);
+				if (p) {
+					kfd_signal_vm_fault_event_with_userptr(p,
+						amdgpu_ttm_adev(bo->tbo.bdev)->kfd.dev, userptr);
+					kfd_unref_process(p);
+				}
+			}
+
ret = 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index fecdb6794075..89943d2146a4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1177,6 +1177,26 @@ void kfd_signal_hw_exception_event(u32 pasid)
kfd_unref_process(p);
}
+/**
+ * kfd_signal_vm_fault_event_with_userptr - report a VM fault for a userptr
+ * @p: KFD process to notify; a NULL process is ignored
+ * @dev: KFD device passed by the caller (currently unused here)
+ * @userptr: user virtual address reported as the faulting address
+ *
+ * Evicts every process device of @p and signals a "not present" memory
+ * exception carrying @userptr to each of them. This function does not
+ * drop any reference on @p.
+ */
+void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p,
+				struct kfd_dev *dev, uint64_t userptr)
+{
+	struct kfd_hsa_memory_exception_data exception_data;
+	struct kfd_process_device *pdd;
+	int i;
+
+	/* The caller passes a process lookup result; nothing to do if it failed. */
+	if (!p)
+		return;
+
+	memset(&exception_data, 0, sizeof(exception_data));
+	exception_data.va = userptr;
+	exception_data.failure.NotPresent = 1;
+
+	/* Send VM seg fault to all kfd process device */
+	for (i = 0; i < p->n_pdds; i++) {
+		pdd = p->pdds[i];
+		exception_data.gpu_id = pdd->user_gpu_id;
+		kfd_evict_process_device(pdd);
+		kfd_signal_vm_fault_event(pdd, NULL, &exception_data);
+	}
+}
+
void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
struct kfd_vm_fault_info *info,
struct kfd_hsa_memory_exception_data *data)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f6aedf69c644..34f47dc1cbbd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1507,6 +1507,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
int kfd_get_num_events(struct kfd_process *p);
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
+/* Signal a VM fault carrying @userptr to every process device of @p. */
+void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p,
+				struct kfd_dev *dev, uint64_t userptr);
+
void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
struct kfd_vm_fault_info *info,
struct kfd_hsa_memory_exception_data *data);
--
2.25.1
More information about the amd-gfx
mailing list