[PATCH 2/2] amd/amdkfd: Trigger segfault for early userptr unmapping
Xiao, Shane
shane.xiao at amd.com
Tue Apr 29 09:49:23 UTC 2025
[Public]
Ping
> -----Original Message-----
> From: Xiao, Shane <shane.xiao at amd.com>
> Sent: Friday, April 25, 2025 11:35 AM
> To: amd-gfx at lists.freedesktop.org; Kuehling, Felix <Felix.Kuehling at amd.com>;
> Koenig, Christian <Christian.Koenig at amd.com>; Yang, Philip
> <Philip.Yang at amd.com>
> Cc: Xiao, Shane <shane.xiao at amd.com>
> Subject: [PATCH 2/2] amd/amdkfd: Trigger segfault for early userptr
> unmapping
>
> If an application unmaps memory before destroying the userptr that refers
> to it, the driver needs to trigger a segfault in VM debug mode to notify
> user space so that it can correct its free sequence.
>
> v2: Send GPU access fault to user space
> v3: Report gpu address to user space, remove unnecessary params
>
> Signed-off-by: Shane Xiao <shane.xiao at amd.com>
> ---
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 15 +++++++++++++++
> drivers/gpu/drm/amd/amdkfd/kfd_events.c | 19 +++++++++++++++++++
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++
> 3 files changed, 36 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index d2ec4130a316..61a698056fb8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -2496,6 +2496,7 @@ static int update_invalid_user_pages(struct
> amdkfd_process_info *process_info,
> struct ttm_operation_ctx ctx = { false, false };
> uint32_t invalid;
> int ret = 0;
> + uint64_t userptr = 0;
>
> mutex_lock(&process_info->notifier_lock);
>
> @@ -2559,6 +2560,20 @@ static int update_invalid_user_pages(struct
> amdkfd_process_info *process_info,
> if (ret != -EFAULT)
> return ret;
>
> + /* If applications unmap memory before destroying the
> userptr
> + * from the KFD, trigger a segmentation fault in VM
> debug mode.
> + */
> + if (amdgpu_ttm_adev(bo->tbo.bdev)-
> >debug_vm_userptr) {
> + amdgpu_ttm_tt_get_userptr(&bo->tbo,
> &userptr);
> + pr_err("User space unmap memory before
> destroying a userptr that refers to it\n");
> + pr_err("The unmap userptr cpu address is
> 0x%llx, gpu address is 0x%llx\n",
> + userptr, mem-
> >va);
> +
> + // Send GPU VM fault to user space
> +
> kfd_signal_vm_fault_event_with_userptr(kfd_lookup_process_by_pid(pr
> ocess_info->pid),
> + mem->va);
> + }
> +
> ret = 0;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index fecdb6794075..e54e708ed82d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -1177,6 +1177,25 @@ void kfd_signal_hw_exception_event(u32 pasid)
> kfd_unref_process(p);
> }
>
> +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p,
> +uint64_t gpu_va) {
> + struct kfd_process_device *pdd;
> + struct kfd_hsa_memory_exception_data exception_data;
> + int i;
> +
> + memset(&exception_data, 0, sizeof(exception_data));
> + exception_data.va = gpu_va;
> + exception_data.failure.NotPresent = 1;
> +
> + // Send VM seg fault to all kfd process device
> + for (i = 0; i < p->n_pdds; i++) {
> + pdd = p->pdds[i];
> + exception_data.gpu_id = pdd->user_gpu_id;
> + kfd_evict_process_device(pdd);
> + kfd_signal_vm_fault_event(pdd, NULL, &exception_data);
> + }
> +}
> +
> void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
> struct kfd_vm_fault_info *info,
> struct kfd_hsa_memory_exception_data *data)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f6aedf69c644..8703be8077b0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1507,6 +1507,8 @@ int kfd_event_create(struct file *devkfd, struct
> kfd_process *p, int kfd_get_num_events(struct kfd_process *p); int
> kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
>
> +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p,
> +uint64_t gpu_va);
> +
> void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
> struct kfd_vm_fault_info *info,
> struct kfd_hsa_memory_exception_data
> *data);
> --
> 2.25.1
More information about the amd-gfx
mailing list