[PATCH 2/2] amd/amdkfd: Trigger segfault for early userptr unmmapping

Tue Apr 29 23:58:18 UTC 2025

On 2025-04-24 23:35, Shane Xiao wrote:
> If applications unmap the memory before destroying the userptr, it needs
> trigger a segfault to notify user space to correct the free sequence in
> VM debug mode.
>
> v2: Send GPU access fault to user space
> v3: Report gpu address to user space, remove unnecessary params
>
> Signed-off-by: Shane Xiao <shane.xiao at amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 15 +++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_events.c       | 19 +++++++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 ++
>  3 files changed, 36 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index d2ec4130a316..61a698056fb8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -2496,6 +2496,7 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
>  	struct ttm_operation_ctx ctx = { false, false };
>  	uint32_t invalid;
>  	int ret = 0;
> +	uint64_t userptr = 0;
>  
>  	mutex_lock(&process_info->notifier_lock);
>  
> @@ -2559,6 +2560,20 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
>  			if (ret != -EFAULT)
>  				return ret;
>  
> +			/* If applications unmap memory before destroying the userptr
> +			 * from the KFD, trigger a segmentation fault in VM debug mode.
> +			 */
> +			if (amdgpu_ttm_adev(bo->tbo.bdev)->debug_vm_userptr) {
> +				amdgpu_ttm_tt_get_userptr(&bo->tbo, &userptr);

Userptr is only used for printing the message. That's probably unnecessary. You should get that address from user mode as well when it handles the fault event and error or warning messages are enabled (HSAKMT_DEBUG_LEVEL=4). The kernel log doesn't need to be overly verbose. IMO it should fit on one line. It may be useful to include the PID of the offending process. E.g.

	pr_err("Pid %d unmapped memory before destroying userptr at GPU addr %llx",
	       pid_nr(process_info->pid), mem->va);

If you remove the useptr variable, the amdgpu_ttm_tt_get_userptr call and shorten pr_err messages to one line, the series is

Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>

> +				pr_err("User space unmap memory before destroying a userptr that refers to it\n");
> +				pr_err("The unmap userptr cpu address is 0x%llx, gpu address is 0x%llx\n",
> +								userptr, mem->va);
> +
> +				// Send GPU VM fault to user space
> +				kfd_signal_vm_fault_event_with_userptr(kfd_lookup_process_by_pid(process_info->pid),
> +								mem->va);
> +			}
> +
>  			ret = 0;
>  		}
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index fecdb6794075..e54e708ed82d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -1177,6 +1177,25 @@ void kfd_signal_hw_exception_event(u32 pasid)
>  	kfd_unref_process(p);
>  }
>  
> +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va)
> +{
> +	struct kfd_process_device *pdd;
> +	struct kfd_hsa_memory_exception_data exception_data;
> +	int i;
> +
> +	memset(&exception_data, 0, sizeof(exception_data));
> +	exception_data.va = gpu_va;
> +	exception_data.failure.NotPresent = 1;
> +
> +	// Send VM seg fault to all kfd process device
> +	for (i = 0; i < p->n_pdds; i++) {
> +		pdd = p->pdds[i];
> +		exception_data.gpu_id = pdd->user_gpu_id;
> +		kfd_evict_process_device(pdd);
> +		kfd_signal_vm_fault_event(pdd, NULL, &exception_data);
> +	}
> +}
> +
>  void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
>  				struct kfd_vm_fault_info *info,
>  				struct kfd_hsa_memory_exception_data *data)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f6aedf69c644..8703be8077b0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1507,6 +1507,8 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
>  int kfd_get_num_events(struct kfd_process *p);
>  int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
>  
> +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va);
> +
>  void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
>  				struct kfd_vm_fault_info *info,
>  				struct kfd_hsa_memory_exception_data *data);