[PATCH v2] drm/amdkfd: Add kfd driver function to support hot plug/unplug amdgpu devices

Thu Oct 17 21:04:26 UTC 2024

On 2024-10-15 17:21, Xiaogang.Chen wrote:
> From: Xiaogang Chen <xiaogang.chen at amd.com>
>
> The purpose of this patch is having kfd driver function as expected during AMD
> gpu device plug/unplug.
>
> When an AMD gpu device got unplug kfd driver stops all queues from this device.
> If there are user processes still ref the render node this device is marked as
> invalid. kfd driver will return error to following requests to the device from
> all existing user processes. Existing user processes can still use remaining
> gpu devices during/after unplug event.
>
> After all refs to the device have been closed from user space kfd driver
> topology got updated by removing correspodent kfd nodes.
>
> User space can use remaining gpu devices that are valid at same time. When all
> AMD gpu devices got removed kfd driver will not allow open /dev/kfd request.
>
> Unplugged AMD gpu devices can be re-plugged. kfd driver will use added devices
> and function as usual.
>
> Signed-off-by: Xiaogang Chen<Xiaogang.Chen at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  5 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  7 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  3 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 78 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       | 43 ++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c  |  6 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  7 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 13 +++-
>   .../amd/amdkfd/kfd_process_queue_manager.c    | 24 ++++++
>   9 files changed, 183 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index b545940e512b..651ae0775f80 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -248,6 +248,11 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
>   		kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
>   }
>   
> +void amdgpu_amdkfd_teardown_kfd_device(struct kfd_dev *kfd)
> +{
> +       kgd2kfd_teardown_kfd_device(kfd);
> +}
> +
>   void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
>   {
>   	if (adev->kfd.dev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 7e0a22072536..bd241f569b79 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -152,6 +152,7 @@ struct amdkfd_process_info {
>   
>   int amdgpu_amdkfd_init(void);
>   void amdgpu_amdkfd_fini(void);
> +void amdgpu_amdkfd_teardown_kfd_device(struct kfd_dev *kfd);
>   
>   void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
>   int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
> @@ -431,6 +432,7 @@ int kgd2kfd_check_and_lock_kfd(void);
>   void kgd2kfd_unlock_kfd(void);
>   int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
>   int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
> +void kgd2kfd_teardown_kfd_device(struct kfd_dev *kfd);
>   #else
>   static inline int kgd2kfd_init(void)
>   {
> @@ -511,5 +513,10 @@ static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
>   {
>   	return 0;
>   }
> +
> +void kgd2kfd_teardown_processes(void)
> +{
> +}
> +
>   #endif
>   #endif /* AMDGPU_AMDKFD_H_INCLUDED */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 1e47655e02c6..4529d7a88b98 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3315,7 +3315,8 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
>   	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>   	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>   
> -	amdgpu_amdkfd_suspend(adev, false);
> +	if (adev->kfd.dev)
> +		amdgpu_amdkfd_teardown_kfd_device(adev->kfd.dev);
>   
>   	/* Workaroud for ASICs need to disable SMC first */
>   	amdgpu_device_smu_fini_early(adev);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index a1f191a5984b..d246f72ae0e9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -327,6 +327,13 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>   		err = -EINVAL;
>   		goto err_pdd;
>   	}
> +
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		pr_debug("gpu 0x%x is not available\n", args->gpu_id);
> +		err = -EINVAL;
> +		goto err_pdd;
> +	}
> +

Instead of duplicating this in all the ioctl functions, could this check 
be done in kfd_process_device_data_by_id?

>   	dev = pdd->dev;
>   
>   	pdd = kfd_bind_process_to_device(dev, p);
> @@ -578,6 +585,12 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
>   		goto err_pdd;
>   	}
>   
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		pr_debug("gpu 0x%x is not available\n", args->gpu_id);
> +		err = -EINVAL;
> +		goto err_pdd;
> +	}
> +
>   	pdd = kfd_bind_process_to_device(pdd->dev, p);
>   	if (IS_ERR(pdd)) {
>   		err = -ESRCH;
> @@ -621,6 +634,11 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
>   		goto err_pdd;
>   	}
>   
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		err = -EINVAL;
> +		goto err_pdd;
> +	}
> +
>   	pdd = kfd_bind_process_to_device(pdd->dev, p);
>   	if (IS_ERR(pdd)) {
>   		err = -ESRCH;
> @@ -704,6 +722,9 @@ static int kfd_ioctl_get_process_apertures(struct file *filp,
>   	for (i = 0; i < p->n_pdds; i++) {
>   		struct kfd_process_device *pdd = p->pdds[i];
>   
> +		if (!is_kfd_process_device_valid(pdd))
> +			continue;
> +
>   		pAperture =
>   			&args->process_apertures[args->num_of_nodes];
>   		pAperture->gpu_id = pdd->dev->id;
> @@ -779,6 +800,9 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
>   	for (i = 0; i < min(p->n_pdds, args->num_of_nodes); i++) {
>   		struct kfd_process_device *pdd = p->pdds[i];
>   
> +		if (!is_kfd_process_device_valid(pdd))
> +			continue;
> +
>   		pa[i].gpu_id = pdd->dev->id;
>   		pa[i].lds_base = pdd->lds_base;
>   		pa[i].lds_limit = pdd->lds_limit;
> @@ -901,6 +925,11 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
>   		goto bind_process_to_device_fail;
>   	}
>   
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		err = PTR_ERR(pdd);
> +		goto bind_process_to_device_fail;
> +	}
> +
>   	pdd->qpd.sh_hidden_private_base = args->va_addr;
>   
>   	mutex_unlock(&p->mutex);
> @@ -981,6 +1010,11 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
>   		goto err_pdd;
>   	}
>   
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		ret = -EINVAL;
> +		goto err_pdd;
> +	}
> +
>   	if (pdd->drm_file) {
>   		ret = pdd->drm_file == drm_file ? 0 : -EBUSY;
>   		goto err_drm_file;
> @@ -1031,6 +1065,10 @@ static int kfd_ioctl_get_available_memory(struct file *filep,
>   
>   	if (!pdd)
>   		return -EINVAL;
> +
> +	if (!is_kfd_process_device_valid(pdd))
> +		return -EINVAL;
> +
>   	args->available = amdgpu_amdkfd_get_available_memory(pdd->dev->adev,
>   							pdd->dev->node_id);
>   	kfd_unlock_pdd(pdd);
> @@ -1090,6 +1128,11 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
>   		goto err_pdd;
>   	}
>   
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		err = -EINVAL;
> +		goto err_pdd;
> +	}
> +
>   	dev = pdd->dev;
>   
>   	if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) &&
> @@ -1202,6 +1245,12 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
>   		goto err_pdd;
>   	}
>   
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		pr_err("Process device is not valid\n");
> +		ret = -EINVAL;
> +		goto err_pdd;
> +	}
> +
>   	mem = kfd_process_device_translate_handle(
>   		pdd, GET_IDR_HANDLE(args->handle));
>   	if (!mem) {
> @@ -1266,6 +1315,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
>   		err = -EINVAL;
>   		goto get_process_device_data_failed;
>   	}
> +
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		err = -EINVAL;
> +		goto get_process_device_data_failed;
> +	}
> +
>   	dev = pdd->dev;
>   
>   	pdd = kfd_bind_process_to_device(dev, p);
> @@ -1384,6 +1439,11 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>   		goto bind_process_to_device_failed;
>   	}
>   
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		err = -EINVAL;
> +		goto bind_process_to_device_failed;
> +	}
> +
>   	mem = kfd_process_device_translate_handle(pdd,
>   						GET_IDR_HANDLE(args->handle));
>   	if (!mem) {
> @@ -1567,6 +1627,11 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
>   		goto err_unlock;
>   	}
>   
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		r = PTR_ERR(pdd);
> +		goto err_unlock;
> +	}
> +
>   	r = amdgpu_amdkfd_gpuvm_import_dmabuf_fd(pdd->dev->adev, args->dmabuf_fd,
>   						 args->va_addr, pdd->drm_priv,
>   						 (struct kgd_mem **)&mem, &size,
> @@ -1616,6 +1681,11 @@ static int kfd_ioctl_export_dmabuf(struct file *filep,
>   		goto err_unlock;
>   	}
>   
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		ret = -EINVAL;
> +		goto err_unlock;
> +	}
> +
>   	mem = kfd_process_device_translate_handle(pdd,
>   						GET_IDR_HANDLE(args->handle));
>   	if (!mem) {
> @@ -1660,6 +1730,9 @@ static int kfd_ioctl_smi_events(struct file *filep,
>   	if (!pdd)
>   		return -EINVAL;
>   
> +	if (!is_kfd_process_device_valid(pdd))
> +		return -EINVAL;
> +
>   	return kfd_smi_event_open(pdd->dev, &args->anon_fd);
>   }
>   
> @@ -2990,6 +3063,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   			r = -ENODEV;
>   			goto unlock_out;
>   		}
> +
> +		if (!is_kfd_process_device_valid(pdd)) {
> +			r = -ENODEV;
> +			goto unlock_out;
> +		}
>   	}
>   
>   	switch (args->op) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index fad1c8f2bc83..019567249110 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -893,6 +893,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>   	svm_range_set_max_pages(kfd->adev);
>   
>   	kfd->init_complete = true;
> +	kfd->valid = true;
>   	dev_info(kfd_device, "added device %x:%x\n", kfd->adev->pdev->vendor,
>   		 kfd->adev->pdev->device);
>   
> @@ -919,6 +920,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>   
>   void kgd2kfd_device_exit(struct kfd_dev *kfd)
>   {
> +	struct kfd_process *p;
> +	unsigned int i, j;
> +	unsigned int temp;
> +
>   	if (kfd->init_complete) {
>   		/* Cleanup KFD nodes */
>   		kfd_cleanup_nodes(kfd, kfd->num_nodes);
> @@ -929,6 +934,20 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>   		amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
>   	}
>   
> +	/* now this kfd_dev has been completely removed from kfd driver
> +	 * before kfree kfd iterate all existing kfd processes, if kfd process
> +	 * uses any kfd node from this kfd set its ref to NULL
> +	 */
> +	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> +		for (i = 0; i < kfd->num_nodes; i++)
> +			for (j = 0; j < p->n_pdds; j++) {
> +				if (kfd->nodes[i] == p->pdds[j]->dev) {
> +					p->pdds[j]->dev = NULL;

Could this be done in teardown_kfd_device? Then you may not need a 
separate "valid" for is_kfd_process_device_valid. And any accidental 
access to a device associated with an invalid pdd would automatically 
trigger a kernel error message with a backtrace.

> +					break;
> +				}
> +			}
> +	}
> +
>   	kfree(kfd);
>   }
>   
> @@ -1485,6 +1504,30 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
>   	return node->dqm->ops.halt(node->dqm);
>   }
>   
> +/* tear down this kfd deve */
> +void kgd2kfd_teardown_kfd_device(struct kfd_dev *kfd)
> +{
> +	struct kfd_process *p;
> +	struct kfd_node *dev;
> +	unsigned int i;
> +	unsigned int temp;
> +
> +	kfd->valid = false;
> +	/* stop queues from kfd nodes in this kfd dev */
> +	for (i = 0; i < kfd->num_nodes; i++) {
> +		dev = kfd->nodes[i];
> +		dev->dqm->ops.stop(dev->dqm);
> +	}

If the GPU was unplugged already, what's the point of this? Won't this 
trigger a timeout?

> +
> +	/* signal a gpu device is being teared down to user spalce processes by
> +	 * KFD_EVENT_TYPE_HW_EXCEPTION event
> +	 */
> +	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes)
> +		kfd_signal_hw_exception_event(p->pasid);

This sends exceptions to all processes. It should only do this for 
processes that use the unplugged device (i.e. have a pdd that uses the 
device). This excludes processes that don't have the device in their cgroup.

> +
> +	return;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
>   /* This function will send a package to HIQ to hang the HWS
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
> index dbcb60eb54b2..b8dd80ee17be 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
> @@ -378,6 +378,12 @@ int kfd_init_apertures(struct kfd_process *process)
>   			continue;
>   		}
>   
> +		/* kfd device that this kfd node belogns is not valid */
> +		if (!dev->kfd->valid) {
> +			id++;
> +			continue;
> +		}
> +
>   		pdd = kfd_create_process_device_data(dev, process);
>   		if (!pdd) {
>   			dev_err(dev->adev->dev,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 6a5bf88cc232..97e7692ce569 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -371,6 +371,9 @@ struct kfd_dev {
>   
>   	/* bitmap for dynamic doorbell allocation from doorbell object */
>   	unsigned long *doorbell_bitmap;
> +
> +	/* this kfd_dev valid or not */
> +	bool valid;
>   };
>   
>   enum kfd_mempool {
> @@ -1055,6 +1058,10 @@ int kfd_process_restore_queues(struct kfd_process *p);
>   void kfd_suspend_all_processes(void);
>   int kfd_resume_all_processes(void);
>   
> +static inline bool is_kfd_process_device_valid(struct kfd_process_device *pdd) {
> +	return (pdd && pdd->dev && pdd->dev->kfd && pdd->dev->kfd->valid);
> +}
> +
>   struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *process,
>   							 uint32_t gpu_id);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index d07acf1b2f93..c06eb9d8008e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1157,8 +1157,6 @@ static void kfd_process_wq_release(struct work_struct *work)
>   	ef = rcu_access_pointer(p->ef);
>   	dma_fence_signal(ef);
>   
> -	kfd_process_remove_sysfs(p);
> -
>   	kfd_process_kunmap_signal_bo(p);
>   	kfd_process_free_outstanding_kfd_bos(p);
>   	svm_range_list_fini(p);
> @@ -1173,6 +1171,11 @@ static void kfd_process_wq_release(struct work_struct *work)
>   
>   	put_task_struct(p->lead_thread);
>   
> +	/* the last step is removing process entries under /sys
> +	 * to indicate the process has been terminated.
> +	 */

This comment doesn't provide any useful information. What would be 
useful is, why this needs to be the last step? Without that, I see no 
good reason for this change.

> +	kfd_process_remove_sysfs(p);
> +
>   	kfree(p);
>   }
>   
> @@ -1536,6 +1539,12 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>   	if (err != 0)
>   		goto err_init_apertures;
>   
> +	/* no any kfd_process_device can be created */
> +	if (!process->n_pdds) {
> +		err = -ENODEV;
> +		goto err_init_apertures;
> +	}
> +
>   	/* Check XNACK support after PDDs are created in kfd_init_apertures */
>   	process->xnack_enabled = kfd_process_xnack_mode(process, false);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index c76db22a1000..eaf4ba65466c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -124,6 +124,11 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
>   		return -EINVAL;
>   	}
>   
> +	if (!is_kfd_process_device_valid(pdd)) {
> +		pr_debug("device 0x%x is not available\n",dev->node_id);
> +		return -EINVAL;
> +	}
> +
>   	/* Only allow one queue per process can have GWS assigned */
>   	if (gws && pdd->qpd.num_gws)
>   		return -EBUSY;
> @@ -498,6 +503,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>   	if (WARN_ON(!dev))
>   		return -ENODEV;
>   
> +	if (!dev->kfd || !dev->kfd->valid) {
> +		pr_err("Process device is not valid\n");

Would you expect to see this message during process termination after a 
hot-unplug? Should this really be an error message, or would an info or 
debug message be more appropriate?

> +		return -1;

This should be a proper error code. -1 is -EPERM.

> +	}
> +
>   	pdd = kfd_get_process_device_data(dev, pqm->process);
>   	if (!pdd) {
>   		pr_err("Process device data doesn't exist\n");
> @@ -567,6 +577,10 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm,
>   		pdd = kfd_get_process_device_data(q->device, q->process);
>   		if (!pdd)
>   			return -ENODEV;
> +
> +		if (!is_kfd_process_device_valid(pdd))
> +			return -ENODEV;
> + 		vm = drm_priv_to_vm(pdd->drm_priv);
>   		err = amdgpu_bo_reserve(vm->root.bo, false);
>   		if (err)
> @@ -612,6 +626,11 @@ int pqm_update_mqd(struct process_queue_manager *pqm,
>   		return -EFAULT;
>   	}
>   
> +	if (!pqn->q->device->kfd->valid) {
> +		pr_debug("device where queue %d exists is not valid\n", qid);
> +		return -EFAULT;
> +	}
> +
>   	/* CUs are masked for debugger requirements so deny user mask  */
>   	if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr)
>   		return -EBUSY;
> @@ -679,6 +698,11 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
>   		return -EFAULT;
>   	}
>   
> +	if (!pqn->q->device->kfd->valid) {
> +		pr_debug("device where queue %d exists is not valid\n", qid);
> +		return -EFAULT;

EFAULT means "bad address". Probably not the right error code here.

Regards,
   Felix

> +	}
> +
>   	return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm,
>   						       pqn->q,
>   						       ctl_stack,