[PATCH v2] drm/amdkfd: Add kfd driver function to support hot plug/unplug amdgpu devices
Chen, Xiaogang
xiaogang.chen at amd.com
Fri Oct 18 14:09:23 UTC 2024
On 10/17/2024 4:04 PM, Felix Kuehling wrote:
>
> On 2024-10-15 17:21, Xiaogang.Chen wrote:
>> From: Xiaogang Chen <xiaogang.chen at amd.com>
>>
>> The purpose of this patch is having kfd driver function as expected
>> during AMD
>> gpu device plug/unplug.
>>
>> When an AMD gpu device got unplug kfd driver stops all queues from
>> this device.
>> If there are user processes still ref the render node this device is
>> marked as
>> invalid. kfd driver will return error to following requests to the
>> device from
>> all existing user processes. Existing user processes can still use
>> remaining
>> gpu devices during/after unplug event.
>>
>> After all refs to the device have been closed from user space kfd driver
>> topology got updated by removing correspodent kfd nodes.
>>
>> User space can use remaining gpu devices that are valid at same time.
>> When all
>> AMD gpu devices got removed kfd driver will not allow open /dev/kfd
>> request.
>>
>> Unplugged AMD gpu devices can be re-plugged. kfd driver will use
>> added devices
>> and function as usual.
>>
>> Signed-off-by: Xiaogang Chen<Xiaogang.Chen at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 ++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 7 ++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +-
>> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 78 +++++++++++++++++++
>> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 43 ++++++++++
>> drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 6 ++
>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 7 ++
>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 13 +++-
>> .../amd/amdkfd/kfd_process_queue_manager.c | 24 ++++++
>> 9 files changed, 183 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index b545940e512b..651ae0775f80 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -248,6 +248,11 @@ void amdgpu_amdkfd_interrupt(struct
>> amdgpu_device *adev,
>> kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
>> }
>> +void amdgpu_amdkfd_teardown_kfd_device(struct kfd_dev *kfd)
>> +{
>> + kgd2kfd_teardown_kfd_device(kfd);
>> +}
>> +
>> void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
>> {
>> if (adev->kfd.dev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 7e0a22072536..bd241f569b79 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -152,6 +152,7 @@ struct amdkfd_process_info {
>> int amdgpu_amdkfd_init(void);
>> void amdgpu_amdkfd_fini(void);
>> +void amdgpu_amdkfd_teardown_kfd_device(struct kfd_dev *kfd);
>> void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
>> int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
>> @@ -431,6 +432,7 @@ int kgd2kfd_check_and_lock_kfd(void);
>> void kgd2kfd_unlock_kfd(void);
>> int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
>> int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
>> +void kgd2kfd_teardown_kfd_device(struct kfd_dev *kfd);
>> #else
>> static inline int kgd2kfd_init(void)
>> {
>> @@ -511,5 +513,10 @@ static inline int kgd2kfd_stop_sched(struct
>> kfd_dev *kfd, uint32_t node_id)
>> {
>> return 0;
>> }
>> +
>> +void kgd2kfd_teardown_processes(void)
>> +{
>> +}
>> +
>> #endif
>> #endif /* AMDGPU_AMDKFD_H_INCLUDED */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 1e47655e02c6..4529d7a88b98 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3315,7 +3315,8 @@ static int amdgpu_device_ip_fini_early(struct
>> amdgpu_device *adev)
>> amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>> amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>> - amdgpu_amdkfd_suspend(adev, false);
>> + if (adev->kfd.dev)
>> + amdgpu_amdkfd_teardown_kfd_device(adev->kfd.dev);
>> /* Workaroud for ASICs need to disable SMC first */
>> amdgpu_device_smu_fini_early(adev);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index a1f191a5984b..d246f72ae0e9 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -327,6 +327,13 @@ static int kfd_ioctl_create_queue(struct file
>> *filep, struct kfd_process *p,
>> err = -EINVAL;
>> goto err_pdd;
>> }
>> +
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + pr_debug("gpu 0x%x is not available\n", args->gpu_id);
>> + err = -EINVAL;
>> + goto err_pdd;
>> + }
>> +
>
> Instead of duplicating this in all the ioctl functions, could this
> check be done in kfd_process_device_data_by_id?
Yes, that makes code simpler. Also, need add same check on
kfd_get_process_device_data.
>
>
>> dev = pdd->dev;
>> pdd = kfd_bind_process_to_device(dev, p);
>> @@ -578,6 +585,12 @@ static int kfd_ioctl_set_memory_policy(struct
>> file *filep,
>> goto err_pdd;
>> }
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + pr_debug("gpu 0x%x is not available\n", args->gpu_id);
>> + err = -EINVAL;
>> + goto err_pdd;
>> + }
>> +
>> pdd = kfd_bind_process_to_device(pdd->dev, p);
>> if (IS_ERR(pdd)) {
>> err = -ESRCH;
>> @@ -621,6 +634,11 @@ static int kfd_ioctl_set_trap_handler(struct
>> file *filep,
>> goto err_pdd;
>> }
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + err = -EINVAL;
>> + goto err_pdd;
>> + }
>> +
>> pdd = kfd_bind_process_to_device(pdd->dev, p);
>> if (IS_ERR(pdd)) {
>> err = -ESRCH;
>> @@ -704,6 +722,9 @@ static int kfd_ioctl_get_process_apertures(struct
>> file *filp,
>> for (i = 0; i < p->n_pdds; i++) {
>> struct kfd_process_device *pdd = p->pdds[i];
>> + if (!is_kfd_process_device_valid(pdd))
>> + continue;
>> +
>> pAperture =
>> &args->process_apertures[args->num_of_nodes];
>> pAperture->gpu_id = pdd->dev->id;
>> @@ -779,6 +800,9 @@ static int
>> kfd_ioctl_get_process_apertures_new(struct file *filp,
>> for (i = 0; i < min(p->n_pdds, args->num_of_nodes); i++) {
>> struct kfd_process_device *pdd = p->pdds[i];
>> + if (!is_kfd_process_device_valid(pdd))
>> + continue;
>> +
>> pa[i].gpu_id = pdd->dev->id;
>> pa[i].lds_base = pdd->lds_base;
>> pa[i].lds_limit = pdd->lds_limit;
>> @@ -901,6 +925,11 @@ static int
>> kfd_ioctl_set_scratch_backing_va(struct file *filep,
>> goto bind_process_to_device_fail;
>> }
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + err = PTR_ERR(pdd);
>> + goto bind_process_to_device_fail;
>> + }
>> +
>> pdd->qpd.sh_hidden_private_base = args->va_addr;
>> mutex_unlock(&p->mutex);
>> @@ -981,6 +1010,11 @@ static int kfd_ioctl_acquire_vm(struct file
>> *filep, struct kfd_process *p,
>> goto err_pdd;
>> }
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + ret = -EINVAL;
>> + goto err_pdd;
>> + }
>> +
>> if (pdd->drm_file) {
>> ret = pdd->drm_file == drm_file ? 0 : -EBUSY;
>> goto err_drm_file;
>> @@ -1031,6 +1065,10 @@ static int
>> kfd_ioctl_get_available_memory(struct file *filep,
>> if (!pdd)
>> return -EINVAL;
>> +
>> + if (!is_kfd_process_device_valid(pdd))
>> + return -EINVAL;
>> +
>> args->available =
>> amdgpu_amdkfd_get_available_memory(pdd->dev->adev,
>> pdd->dev->node_id);
>> kfd_unlock_pdd(pdd);
>> @@ -1090,6 +1128,11 @@ static int
>> kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
>> goto err_pdd;
>> }
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + err = -EINVAL;
>> + goto err_pdd;
>> + }
>> +
>> dev = pdd->dev;
>> if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) &&
>> @@ -1202,6 +1245,12 @@ static int kfd_ioctl_free_memory_of_gpu(struct
>> file *filep,
>> goto err_pdd;
>> }
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + pr_err("Process device is not valid\n");
>> + ret = -EINVAL;
>> + goto err_pdd;
>> + }
>> +
>> mem = kfd_process_device_translate_handle(
>> pdd, GET_IDR_HANDLE(args->handle));
>> if (!mem) {
>> @@ -1266,6 +1315,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct
>> file *filep,
>> err = -EINVAL;
>> goto get_process_device_data_failed;
>> }
>> +
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + err = -EINVAL;
>> + goto get_process_device_data_failed;
>> + }
>> +
>> dev = pdd->dev;
>> pdd = kfd_bind_process_to_device(dev, p);
>> @@ -1384,6 +1439,11 @@ static int
>> kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>> goto bind_process_to_device_failed;
>> }
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + err = -EINVAL;
>> + goto bind_process_to_device_failed;
>> + }
>> +
>> mem = kfd_process_device_translate_handle(pdd,
>> GET_IDR_HANDLE(args->handle));
>> if (!mem) {
>> @@ -1567,6 +1627,11 @@ static int kfd_ioctl_import_dmabuf(struct file
>> *filep,
>> goto err_unlock;
>> }
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + r = PTR_ERR(pdd);
>> + goto err_unlock;
>> + }
>> +
>> r = amdgpu_amdkfd_gpuvm_import_dmabuf_fd(pdd->dev->adev,
>> args->dmabuf_fd,
>> args->va_addr, pdd->drm_priv,
>> (struct kgd_mem **)&mem, &size,
>> @@ -1616,6 +1681,11 @@ static int kfd_ioctl_export_dmabuf(struct file
>> *filep,
>> goto err_unlock;
>> }
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + ret = -EINVAL;
>> + goto err_unlock;
>> + }
>> +
>> mem = kfd_process_device_translate_handle(pdd,
>> GET_IDR_HANDLE(args->handle));
>> if (!mem) {
>> @@ -1660,6 +1730,9 @@ static int kfd_ioctl_smi_events(struct file
>> *filep,
>> if (!pdd)
>> return -EINVAL;
>> + if (!is_kfd_process_device_valid(pdd))
>> + return -EINVAL;
>> +
>> return kfd_smi_event_open(pdd->dev, &args->anon_fd);
>> }
>> @@ -2990,6 +3063,11 @@ static int kfd_ioctl_set_debug_trap(struct
>> file *filep, struct kfd_process *p, v
>> r = -ENODEV;
>> goto unlock_out;
>> }
>> +
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + r = -ENODEV;
>> + goto unlock_out;
>> + }
>> }
>> switch (args->op) {
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index fad1c8f2bc83..019567249110 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -893,6 +893,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>> svm_range_set_max_pages(kfd->adev);
>> kfd->init_complete = true;
>> + kfd->valid = true;
>> dev_info(kfd_device, "added device %x:%x\n",
>> kfd->adev->pdev->vendor,
>> kfd->adev->pdev->device);
>> @@ -919,6 +920,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>> void kgd2kfd_device_exit(struct kfd_dev *kfd)
>> {
>> + struct kfd_process *p;
>> + unsigned int i, j;
>> + unsigned int temp;
>> +
>> if (kfd->init_complete) {
>> /* Cleanup KFD nodes */
>> kfd_cleanup_nodes(kfd, kfd->num_nodes);
>> @@ -929,6 +934,20 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>> amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
>> }
>> + /* now this kfd_dev has been completely removed from kfd driver
>> + * before kfree kfd iterate all existing kfd processes, if kfd
>> process
>> + * uses any kfd node from this kfd set its ref to NULL
>> + */
>> + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>> + for (i = 0; i < kfd->num_nodes; i++)
>> + for (j = 0; j < p->n_pdds; j++) {
>> + if (kfd->nodes[i] == p->pdds[j]->dev) {
>> + p->pdds[j]->dev = NULL;
>
> Could this be done in teardown_kfd_device? Then you may not need a
> separate "valid" for is_kfd_process_device_valid. And any accidental
> access to a device associated with an invalid pdd would automatically
> trigger a kernel error message with a backtrace.
At teardown_kfd_device the adev has not been removed, ex: there are user
apps still refer the render node. kfd dev(kfd nodes) is released at
kfd_cleanup_nodes of kgd2kfd_device_exit when all ref to adev got
released(user apps close render node). During that time kfd
nodes(pdd->dev) are valid. We still can access kfd nodes data structure
though their queues got stopped and kfd_node->kfd has been markded as
invalid.
>
>
>> + break;
>> + }
>> + }
>> + }
>> +
>> kfree(kfd);
>> }
>> @@ -1485,6 +1504,30 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd,
>> uint32_t node_id)
>> return node->dqm->ops.halt(node->dqm);
>> }
>> +/* tear down this kfd deve */
>> +void kgd2kfd_teardown_kfd_device(struct kfd_dev *kfd)
>> +{
>> + struct kfd_process *p;
>> + struct kfd_node *dev;
>> + unsigned int i;
>> + unsigned int temp;
>> +
>> + kfd->valid = false;
>> + /* stop queues from kfd nodes in this kfd dev */
>> + for (i = 0; i < kfd->num_nodes; i++) {
>> + dev = kfd->nodes[i];
>> + dev->dqm->ops.stop(dev->dqm);
>> + }
>
> If the GPU was unplugged already, what's the point of this? Won't this
> trigger a timeout?
>
pci base driver will find that the device has been unplugged, will not
call amdgpu driver's callback for pci device that has been removed. So
that would not happen.
>
>> +
>> + /* signal a gpu device is being teared down to user spalce
>> processes by
>> + * KFD_EVENT_TYPE_HW_EXCEPTION event
>> + */
>> + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes)
>> + kfd_signal_hw_exception_event(p->pasid);
>
> This sends exceptions to all processes. It should only do this for
> processes that use the unplugged device (i.e. have a pdd that uses the
> device). This excludes processes that don't have the device in their
> cgroup.
ok, will iterate all existing kfd processes. If any kfd node from this
kfd dev got used by a kfd process send the event to correspondent user
process.
>
>
>> +
>> + return;
>> +}
>> +
>> #if defined(CONFIG_DEBUG_FS)
>> /* This function will send a package to HIQ to hang the HWS
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>> index dbcb60eb54b2..b8dd80ee17be 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>> @@ -378,6 +378,12 @@ int kfd_init_apertures(struct kfd_process *process)
>> continue;
>> }
>> + /* kfd device that this kfd node belogns is not valid */
>> + if (!dev->kfd->valid) {
>> + id++;
>> + continue;
>> + }
>> +
>> pdd = kfd_create_process_device_data(dev, process);
>> if (!pdd) {
>> dev_err(dev->adev->dev,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 6a5bf88cc232..97e7692ce569 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -371,6 +371,9 @@ struct kfd_dev {
>> /* bitmap for dynamic doorbell allocation from doorbell
>> object */
>> unsigned long *doorbell_bitmap;
>> +
>> + /* this kfd_dev valid or not */
>> + bool valid;
>> };
>> enum kfd_mempool {
>> @@ -1055,6 +1058,10 @@ int kfd_process_restore_queues(struct
>> kfd_process *p);
>> void kfd_suspend_all_processes(void);
>> int kfd_resume_all_processes(void);
>> +static inline bool is_kfd_process_device_valid(struct
>> kfd_process_device *pdd) {
>> + return (pdd && pdd->dev && pdd->dev->kfd && pdd->dev->kfd->valid);
>> +}
>> +
>> struct kfd_process_device *kfd_process_device_data_by_id(struct
>> kfd_process *process,
>> uint32_t gpu_id);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index d07acf1b2f93..c06eb9d8008e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -1157,8 +1157,6 @@ static void kfd_process_wq_release(struct
>> work_struct *work)
>> ef = rcu_access_pointer(p->ef);
>> dma_fence_signal(ef);
>> - kfd_process_remove_sysfs(p);
>> -
>> kfd_process_kunmap_signal_bo(p);
>> kfd_process_free_outstanding_kfd_bos(p);
>> svm_range_list_fini(p);
>> @@ -1173,6 +1171,11 @@ static void kfd_process_wq_release(struct
>> work_struct *work)
>> put_task_struct(p->lead_thread);
>> + /* the last step is removing process entries under /sys
>> + * to indicate the process has been terminated.
>> + */
>
> This comment doesn't provide any useful information. What would be
> useful is, why this needs to be the last step? Without that, I see no
> good reason for this change.
ok, this change is not related to the patch. I thought it is better to
update kfd topology at last step after all kfd process resources got
released. I will remove this change.
>
>
>> + kfd_process_remove_sysfs(p);
>> +
>> kfree(p);
>> }
>> @@ -1536,6 +1539,12 @@ static struct kfd_process
>> *create_process(const struct task_struct *thread)
>> if (err != 0)
>> goto err_init_apertures;
>> + /* no any kfd_process_device can be created */
>> + if (!process->n_pdds) {
>> + err = -ENODEV;
>> + goto err_init_apertures;
>> + }
>> +
>> /* Check XNACK support after PDDs are created in
>> kfd_init_apertures */
>> process->xnack_enabled = kfd_process_xnack_mode(process, false);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index c76db22a1000..eaf4ba65466c 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -124,6 +124,11 @@ int pqm_set_gws(struct process_queue_manager
>> *pqm, unsigned int qid,
>> return -EINVAL;
>> }
>> + if (!is_kfd_process_device_valid(pdd)) {
>> + pr_debug("device 0x%x is not available\n",dev->node_id);
>> + return -EINVAL;
>> + }
>> +
>> /* Only allow one queue per process can have GWS assigned */
>> if (gws && pdd->qpd.num_gws)
>> return -EBUSY;
>> @@ -498,6 +503,11 @@ int pqm_destroy_queue(struct
>> process_queue_manager *pqm, unsigned int qid)
>> if (WARN_ON(!dev))
>> return -ENODEV;
>> + if (!dev->kfd || !dev->kfd->valid) {
>> + pr_err("Process device is not valid\n");
>
> Would you expect to see this message during process termination after
> a hot-unplug? Should this really be an error message, or would an info
> or debug message be more appropriate?
I will change kfd_get_process_device_data that will include this check,
then this message will be merged at !pdd case.
>
>
>> + return -1;
>
> This should be a proper error code. -1 is -EPERM.
>
Same as above.
>
>> + }
>> +
>> pdd = kfd_get_process_device_data(dev, pqm->process);
>> if (!pdd) {
>> pr_err("Process device data doesn't exist\n");
>> @@ -567,6 +577,10 @@ int pqm_update_queue_properties(struct
>> process_queue_manager *pqm,
>> pdd = kfd_get_process_device_data(q->device, q->process);
>> if (!pdd)
>> return -ENODEV;
>> +
>> + if (!is_kfd_process_device_valid(pdd))
>> + return -ENODEV;
>> + vm = drm_priv_to_vm(pdd->drm_priv);
>> err = amdgpu_bo_reserve(vm->root.bo, false);
>> if (err)
>> @@ -612,6 +626,11 @@ int pqm_update_mqd(struct process_queue_manager
>> *pqm,
>> return -EFAULT;
>> }
>> + if (!pqn->q->device->kfd->valid) {
>> + pr_debug("device where queue %d exists is not valid\n", qid);
>> + return -EFAULT;
>> + }
>> +
>> /* CUs are masked for debugger requirements so deny user mask */
>> if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr)
>> return -EBUSY;
>> @@ -679,6 +698,11 @@ int pqm_get_wave_state(struct
>> process_queue_manager *pqm,
>> return -EFAULT;
>> }
>> + if (!pqn->q->device->kfd->valid) {
>> + pr_debug("device where queue %d exists is not valid\n", qid);
>> + return -EFAULT;
>
> EFAULT means "bad address". Probably not the right error code here.
Will use -EINVAL.
Thanks
Xiaogang
>
> Regards,
> Felix
>
>
>> + }
>> +
>> return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm,
>> pqn->q,
>> ctl_stack,
More information about the amd-gfx
mailing list