[PATCH v2] drm/amdkfd: Add kfd driver function to support hot plug/unplug amdgpu devices

Fri Oct 18 22:07:54 UTC 2024

On 2024-10-18 17:31, Chen, Xiaogang wrote:
>
> On 10/18/2024 12:57 PM, Felix Kuehling wrote:
>>
>> On 2024-10-18 10:09, Chen, Xiaogang wrote:
>>>
>>> On 10/17/2024 4:04 PM, Felix Kuehling wrote:
>>>>
>>>> On 2024-10-15 17:21, Xiaogang.Chen wrote:
>>>>> From: Xiaogang Chen <xiaogang.chen at amd.com>
>>>>>
>>>>> The purpose of this patch is having kfd driver function as 
>>>>> expected during AMD
>>>>> gpu device plug/unplug.
>>>>>
>>>>> When an AMD gpu device got unplug kfd driver stops all queues from 
>>>>> this device.
>>>>> If there are user processes still ref the render node this device 
>>>>> is marked as
>>>>> invalid. kfd driver will return error to following requests to the 
>>>>> device from
>>>>> all existing user processes. Existing user processes can still use 
>>>>> remaining
>>>>> gpu devices during/after unplug event.
>>>>>
>>>>> After all refs to the device have been closed from user space kfd 
>>>>> driver
>>>>> topology got updated by removing correspodent kfd nodes.
>>>>>
>>>>> User space can use remaining gpu devices that are valid at same 
>>>>> time. When all
>>>>> AMD gpu devices got removed kfd driver will not allow open 
>>>>> /dev/kfd request.
>>>>>
>>>>> Unplugged AMD gpu devices can be re-plugged. kfd driver will use 
>>>>> added devices
>>>>> and function as usual.
>>>>>
>>>>> Signed-off-by: Xiaogang Chen<Xiaogang.Chen at amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  5 ++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  7 ++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  3 +-
>>>>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 78 
>>>>> +++++++++++++++++++
>>>>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       | 43 ++++++++++
>>>>>   drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c  |  6 ++
>>>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  7 ++
>>>>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 13 +++-
>>>>>   .../amd/amdkfd/kfd_process_queue_manager.c    | 24 ++++++
>>>>>   9 files changed, 183 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>>>> index b545940e512b..651ae0775f80 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>>>> @@ -248,6 +248,11 @@ void amdgpu_amdkfd_interrupt(struct 
>>>>> amdgpu_device *adev,
>>>>>           kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
>>>>>   }
>>>>>   +void amdgpu_amdkfd_teardown_kfd_device(struct kfd_dev *kfd)
>>>>> +{
>>>>> +       kgd2kfd_teardown_kfd_device(kfd);
>>>>> +}
>>>>> +
>>>>>   void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
>>>>>   {
>>>>>       if (adev->kfd.dev)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>>> index 7e0a22072536..bd241f569b79 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>>> @@ -152,6 +152,7 @@ struct amdkfd_process_info {
>>>>>     int amdgpu_amdkfd_init(void);
>>>>>   void amdgpu_amdkfd_fini(void);
>>>>> +void amdgpu_amdkfd_teardown_kfd_device(struct kfd_dev *kfd);
>>>>>     void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool 
>>>>> run_pm);
>>>>>   int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
>>>>> @@ -431,6 +432,7 @@ int kgd2kfd_check_and_lock_kfd(void);
>>>>>   void kgd2kfd_unlock_kfd(void);
>>>>>   int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
>>>>>   int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
>>>>> +void kgd2kfd_teardown_kfd_device(struct kfd_dev *kfd);
>>>>>   #else
>>>>>   static inline int kgd2kfd_init(void)
>>>>>   {
>>>>> @@ -511,5 +513,10 @@ static inline int kgd2kfd_stop_sched(struct 
>>>>> kfd_dev *kfd, uint32_t node_id)
>>>>>   {
>>>>>       return 0;
>>>>>   }
>>>>> +
>>>>> +void kgd2kfd_teardown_processes(void)
>>>>> +{
>>>>> +}
>>>>> +
>>>>>   #endif
>>>>>   #endif /* AMDGPU_AMDKFD_H_INCLUDED */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index 1e47655e02c6..4529d7a88b98 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -3315,7 +3315,8 @@ static int 
>>>>> amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
>>>>>       amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>>       amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>>   -    amdgpu_amdkfd_suspend(adev, false);
>>>>> +    if (adev->kfd.dev)
>>>>> + amdgpu_amdkfd_teardown_kfd_device(adev->kfd.dev);
>>>>>         /* Workaroud for ASICs need to disable SMC first */
>>>>>       amdgpu_device_smu_fini_early(adev);
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> index a1f191a5984b..d246f72ae0e9 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> @@ -327,6 +327,13 @@ static int kfd_ioctl_create_queue(struct file 
>>>>> *filep, struct kfd_process *p,
>>>>>           err = -EINVAL;
>>>>>           goto err_pdd;
>>>>>       }
>>>>> +
>>>>> +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        pr_debug("gpu 0x%x is not available\n", args->gpu_id);
>>>>> +        err = -EINVAL;
>>>>> +        goto err_pdd;
>>>>> +    }
>>>>> +
>>>>
>>>> Instead of duplicating this in all the ioctl functions, could this 
>>>> check be done in kfd_process_device_data_by_id?
>>>
>>> Yes, that makes code simpler. Also, need add same check on 
>>> kfd_get_process_device_data.
>>
>> Maybe not. kfd_get_process_device_data gets a kfd_node as parameter, 
>> that callers typically get from get from a call to kfd_device_by_id. 
>> Maybe the check should be in kfd_get_device_by_id so it doesn't 
>> return invalid devices.
>
> kfd_get_process_device_data is used by queue operations. They use "dev 
> = pqn->q->device" to locate kfd node from queue structure, not 
> kfd_device_by_id. Then locate pdd.
>
> and yes,  at kfd_device_by_id we also need check returned 
> dev->kfd->valid.
>
> dev->kfd->valid means this kfd device cannot be used now(or being 
> removed). Its resources(kfd_dev, kfd_node) are not released yet until 
> user apps release all refs to the adev.
>
>>
>>
>>>
>>>>
>>>>
>>>>>       dev = pdd->dev;
>>>>>         pdd = kfd_bind_process_to_device(dev, p);
>>>>> @@ -578,6 +585,12 @@ static int kfd_ioctl_set_memory_policy(struct 
>>>>> file *filep,
>>>>>           goto err_pdd;
>>>>>       }
>>>>>   +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        pr_debug("gpu 0x%x is not available\n", args->gpu_id);
>>>>> +        err = -EINVAL;
>>>>> +        goto err_pdd;
>>>>> +    }
>>>>> +
>>>>>       pdd = kfd_bind_process_to_device(pdd->dev, p);
>>>>>       if (IS_ERR(pdd)) {
>>>>>           err = -ESRCH;
>>>>> @@ -621,6 +634,11 @@ static int kfd_ioctl_set_trap_handler(struct 
>>>>> file *filep,
>>>>>           goto err_pdd;
>>>>>       }
>>>>>   +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        err = -EINVAL;
>>>>> +        goto err_pdd;
>>>>> +    }
>>>>> +
>>>>>       pdd = kfd_bind_process_to_device(pdd->dev, p);
>>>>>       if (IS_ERR(pdd)) {
>>>>>           err = -ESRCH;
>>>>> @@ -704,6 +722,9 @@ static int 
>>>>> kfd_ioctl_get_process_apertures(struct file *filp,
>>>>>       for (i = 0; i < p->n_pdds; i++) {
>>>>>           struct kfd_process_device *pdd = p->pdds[i];
>>>>>   +        if (!is_kfd_process_device_valid(pdd))
>>>>> +            continue;
>>>>> +
>>>>>           pAperture =
>>>>> &args->process_apertures[args->num_of_nodes];
>>>>>           pAperture->gpu_id = pdd->dev->id;
>>>>> @@ -779,6 +800,9 @@ static int 
>>>>> kfd_ioctl_get_process_apertures_new(struct file *filp,
>>>>>       for (i = 0; i < min(p->n_pdds, args->num_of_nodes); i++) {
>>>>>           struct kfd_process_device *pdd = p->pdds[i];
>>>>>   +        if (!is_kfd_process_device_valid(pdd))
>>>>> +            continue;
>>>>> +
>>>>>           pa[i].gpu_id = pdd->dev->id;
>>>>>           pa[i].lds_base = pdd->lds_base;
>>>>>           pa[i].lds_limit = pdd->lds_limit;
>>>>> @@ -901,6 +925,11 @@ static int 
>>>>> kfd_ioctl_set_scratch_backing_va(struct file *filep,
>>>>>           goto bind_process_to_device_fail;
>>>>>       }
>>>>>   +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        err = PTR_ERR(pdd);
>>>>> +        goto bind_process_to_device_fail;
>>>>> +    }
>>>>> +
>>>>>       pdd->qpd.sh_hidden_private_base = args->va_addr;
>>>>>         mutex_unlock(&p->mutex);
>>>>> @@ -981,6 +1010,11 @@ static int kfd_ioctl_acquire_vm(struct file 
>>>>> *filep, struct kfd_process *p,
>>>>>           goto err_pdd;
>>>>>       }
>>>>>   +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        ret = -EINVAL;
>>>>> +        goto err_pdd;
>>>>> +    }
>>>>> +
>>>>>       if (pdd->drm_file) {
>>>>>           ret = pdd->drm_file == drm_file ? 0 : -EBUSY;
>>>>>           goto err_drm_file;
>>>>> @@ -1031,6 +1065,10 @@ static int 
>>>>> kfd_ioctl_get_available_memory(struct file *filep,
>>>>>         if (!pdd)
>>>>>           return -EINVAL;
>>>>> +
>>>>> +    if (!is_kfd_process_device_valid(pdd))
>>>>> +        return -EINVAL;
>>>>> +
>>>>>       args->available = 
>>>>> amdgpu_amdkfd_get_available_memory(pdd->dev->adev,
>>>>>                               pdd->dev->node_id);
>>>>>       kfd_unlock_pdd(pdd);
>>>>> @@ -1090,6 +1128,11 @@ static int 
>>>>> kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
>>>>>           goto err_pdd;
>>>>>       }
>>>>>   +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        err = -EINVAL;
>>>>> +        goto err_pdd;
>>>>> +    }
>>>>> +
>>>>>       dev = pdd->dev;
>>>>>         if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) &&
>>>>> @@ -1202,6 +1245,12 @@ static int 
>>>>> kfd_ioctl_free_memory_of_gpu(struct file *filep,
>>>>>           goto err_pdd;
>>>>>       }
>>>>>   +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        pr_err("Process device is not valid\n");
>>>>> +        ret = -EINVAL;
>>>>> +        goto err_pdd;
>>>>> +    }
>>>>> +
>>>>>       mem = kfd_process_device_translate_handle(
>>>>>           pdd, GET_IDR_HANDLE(args->handle));
>>>>>       if (!mem) {
>>>>> @@ -1266,6 +1315,12 @@ static int 
>>>>> kfd_ioctl_map_memory_to_gpu(struct file *filep,
>>>>>           err = -EINVAL;
>>>>>           goto get_process_device_data_failed;
>>>>>       }
>>>>> +
>>>>> +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        err = -EINVAL;
>>>>> +        goto get_process_device_data_failed;
>>>>> +    }
>>>>> +
>>>>>       dev = pdd->dev;
>>>>>         pdd = kfd_bind_process_to_device(dev, p);
>>>>> @@ -1384,6 +1439,11 @@ static int 
>>>>> kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>>>>>           goto bind_process_to_device_failed;
>>>>>       }
>>>>>   +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        err = -EINVAL;
>>>>> +        goto bind_process_to_device_failed;
>>>>> +    }
>>>>> +
>>>>>       mem = kfd_process_device_translate_handle(pdd,
>>>>> GET_IDR_HANDLE(args->handle));
>>>>>       if (!mem) {
>>>>> @@ -1567,6 +1627,11 @@ static int kfd_ioctl_import_dmabuf(struct 
>>>>> file *filep,
>>>>>           goto err_unlock;
>>>>>       }
>>>>>   +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        r = PTR_ERR(pdd);
>>>>> +        goto err_unlock;
>>>>> +    }
>>>>> +
>>>>>       r = amdgpu_amdkfd_gpuvm_import_dmabuf_fd(pdd->dev->adev, 
>>>>> args->dmabuf_fd,
>>>>>                            args->va_addr, pdd->drm_priv,
>>>>>                            (struct kgd_mem **)&mem, &size,
>>>>> @@ -1616,6 +1681,11 @@ static int kfd_ioctl_export_dmabuf(struct 
>>>>> file *filep,
>>>>>           goto err_unlock;
>>>>>       }
>>>>>   +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        ret = -EINVAL;
>>>>> +        goto err_unlock;
>>>>> +    }
>>>>> +
>>>>>       mem = kfd_process_device_translate_handle(pdd,
>>>>> GET_IDR_HANDLE(args->handle));
>>>>>       if (!mem) {
>>>>> @@ -1660,6 +1730,9 @@ static int kfd_ioctl_smi_events(struct file 
>>>>> *filep,
>>>>>       if (!pdd)
>>>>>           return -EINVAL;
>>>>>   +    if (!is_kfd_process_device_valid(pdd))
>>>>> +        return -EINVAL;
>>>>> +
>>>>>       return kfd_smi_event_open(pdd->dev, &args->anon_fd);
>>>>>   }
>>>>>   @@ -2990,6 +3063,11 @@ static int 
>>>>> kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>>>>>               r = -ENODEV;
>>>>>               goto unlock_out;
>>>>>           }
>>>>> +
>>>>> +        if (!is_kfd_process_device_valid(pdd)) {
>>>>> +            r = -ENODEV;
>>>>> +            goto unlock_out;
>>>>> +        }
>>>>>       }
>>>>>         switch (args->op) {
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>>> index fad1c8f2bc83..019567249110 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>>> @@ -893,6 +893,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>>>>>       svm_range_set_max_pages(kfd->adev);
>>>>>         kfd->init_complete = true;
>>>>> +    kfd->valid = true;
>>>>>       dev_info(kfd_device, "added device %x:%x\n", 
>>>>> kfd->adev->pdev->vendor,
>>>>>            kfd->adev->pdev->device);
>>>>>   @@ -919,6 +920,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>>>>>     void kgd2kfd_device_exit(struct kfd_dev *kfd)
>>>>>   {
>>>>> +    struct kfd_process *p;
>>>>> +    unsigned int i, j;
>>>>> +    unsigned int temp;
>>>>> +
>>>>>       if (kfd->init_complete) {
>>>>>           /* Cleanup KFD nodes */
>>>>>           kfd_cleanup_nodes(kfd, kfd->num_nodes);
>>>>> @@ -929,6 +934,20 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>>>>>           amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
>>>>>       }
>>>>>   +    /* now this kfd_dev has been completely removed from kfd 
>>>>> driver
>>>>> +     * before kfree kfd iterate all existing kfd processes, if 
>>>>> kfd process
>>>>> +     * uses any kfd node from this kfd set its ref to NULL
>>>>> +     */
>>>>> +    hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>>>>> +        for (i = 0; i < kfd->num_nodes; i++)
>>>>> +            for (j = 0; j < p->n_pdds; j++) {
>>>>> +                if (kfd->nodes[i] == p->pdds[j]->dev) {
>>>>> +                    p->pdds[j]->dev = NULL;
>>>>
>>>> Could this be done in teardown_kfd_device? Then you may not need a 
>>>> separate "valid" for is_kfd_process_device_valid. And any 
>>>> accidental access to a device associated with an invalid pdd would 
>>>> automatically trigger a kernel error message with a backtrace.
>>>
>>> At teardown_kfd_device the adev has not been removed, ex: there are 
>>> user apps still refer the render node. kfd dev(kfd nodes) is 
>>> released at kfd_cleanup_nodes of kgd2kfd_device_exit when all ref to 
>>> adev got released(user apps close render node). During that time kfd 
>>> nodes(pdd->dev) are valid. We still can access kfd nodes data 
>>> structure though their queues got stopped and kfd_node->kfd has been 
>>> markded as invalid.
>>
>> I'm not sure why that matters. The fact that the there are still 
>> other pointers to the dev doesn't mean the pointer in the pdd must 
>> remain valid.
>>
> Ex: when kfd process is released we call 
> kfd_process_dequeue_from_all_devices to terminate all queues(the 
> queues have been stopped after unplug the adev), we use pdd->dev to 
> locate kfd node for following "Clears all process queues belongs to 
> that device"
>
> At teardown_kfd_device the kfd dev(its kfd nodes) has not been 
> released. keeping pdd->dev is convenient when need locate the kfd dev 
> from pdd. Otherwise we need use other ways to locate kfd node.
>
>> On the other hand, there should be code in 
>> kgd2kfd_teardown_kfd_device or kgd2kfd_device_exit to clean up _all_ 
>> the other pointers to the invalid kfd_dev and kfd_nodes. AFAICT the 
>> kfd_dev and kfd_nodes are not reference counted, and there is no 
>> guarantee that these structures still exist by the time the processes 
>> terminate and run their cleanup code. You can't rely on 
>> kfd_dev->valid after the kfd_dev itself has been freed with kfree in 
>> kgd2kfd_device_exit. By that time all pointers to the kfd_dev and its 
>> nodes must have been cleaned up.
>>
>> A quick survey of the header files shows
>>
>>  * kfd_dev pointers in kfd_node, kfd_device_queue_manager
>>  * kfd_node pointers in kfd_dev, kfd_bo, queue, kernel_queue,
>>    kfd_process_device, kfd_*_properties referenced in lists in
>>    kfd_topology_device, svm_range_bo, mqd_manager
> I had same thought when did it. Here all queues on this kfd dev are 
> stopped at first, and all kfd nodes from kfd dev got marked as 
> invalid, so not serve any new request on these kfd nodes at api level. 
> Any existing operations that use kfd node check if it is valid.

That doesn't help if the kfd_node and kfd_dev structure were already 
freed with kfree. Before these structures are freed, you need to make 
sure that you don't have any dangling pointers to them in any of the 
data structures I listed, especially the per-process structures.

Regards,
   Felix

>>
>>
>>>
>>>>
>>>>
>>>>> +                    break;
>>>>> +                }
>>>>> +            }
>>>>> +    }
>>>>> +
>>>>>       kfree(kfd);
>>>>>   }
>>>>>   @@ -1485,6 +1504,30 @@ int kgd2kfd_stop_sched(struct kfd_dev 
>>>>> *kfd, uint32_t node_id)
>>>>>       return node->dqm->ops.halt(node->dqm);
>>>>>   }
>>>>>   +/* tear down this kfd deve */
>>>>> +void kgd2kfd_teardown_kfd_device(struct kfd_dev *kfd)
>>>>> +{
>>>>> +    struct kfd_process *p;
>>>>> +    struct kfd_node *dev;
>>>>> +    unsigned int i;
>>>>> +    unsigned int temp;
>>>>> +
>>>>> +    kfd->valid = false;
>>>>> +    /* stop queues from kfd nodes in this kfd dev */
>>>>> +    for (i = 0; i < kfd->num_nodes; i++) {
>>>>> +        dev = kfd->nodes[i];
>>>>> +        dev->dqm->ops.stop(dev->dqm);
>>>>> +    }
>>>>
>>>> If the GPU was unplugged already, what's the point of this? Won't 
>>>> this trigger a timeout?
>>>>
>>> pci base driver will find that the device has been unplugged, will 
>>> not call amdgpu driver's callback for pci device that has been 
>>> removed. So that would not happen.
>>
>> This has nothing to do with PCIe callbacks. dev->dqm->ops.stop tries 
>> to talk to the HWS firmware to remove queues. That will hand or time 
>> out if the GPU has been unplugged.
>
> Not sure understand that. Inside kgd2kfd_teardown_kfd_device the adev 
> has not been released(adev got released from pci system after 
> kgd2kfd_teardown_kfd_device return), so can do dev->dqm->ops.stop. If 
> user tries unplug same device again pci base driver will not find it, 
> then not call amdgpu driver. Misunderstanding?
>
> Thanks
>
> Xiaogang
>
>>
>> Regards,
>>   Felix
>>
>>
>>>>
>>>>> +
>>>>> +    /* signal a gpu device is being teared down to user spalce 
>>>>> processes by
>>>>> +     * KFD_EVENT_TYPE_HW_EXCEPTION event
>>>>> +     */
>>>>> +    hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes)
>>>>> +        kfd_signal_hw_exception_event(p->pasid);
>>>>
>>>> This sends exceptions to all processes. It should only do this for 
>>>> processes that use the unplugged device (i.e. have a pdd that uses 
>>>> the device). This excludes processes that don't have the device in 
>>>> their cgroup.
>>> ok, will iterate all existing kfd processes. If any kfd node from 
>>> this kfd dev got used by a kfd process send the event to 
>>> correspondent user process.
>>>>
>>>>
>>>>> +
>>>>> +    return;
>>>>> +}
>>>>> +
>>>>>   #if defined(CONFIG_DEBUG_FS)
>>>>>     /* This function will send a package to HIQ to hang the HWS
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>>>>> index dbcb60eb54b2..b8dd80ee17be 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
>>>>> @@ -378,6 +378,12 @@ int kfd_init_apertures(struct kfd_process 
>>>>> *process)
>>>>>               continue;
>>>>>           }
>>>>>   +        /* kfd device that this kfd node belogns is not valid */
>>>>> +        if (!dev->kfd->valid) {
>>>>> +            id++;
>>>>> +            continue;
>>>>> +        }
>>>>> +
>>>>>           pdd = kfd_create_process_device_data(dev, process);
>>>>>           if (!pdd) {
>>>>>               dev_err(dev->adev->dev,
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> index 6a5bf88cc232..97e7692ce569 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> @@ -371,6 +371,9 @@ struct kfd_dev {
>>>>>         /* bitmap for dynamic doorbell allocation from doorbell 
>>>>> object */
>>>>>       unsigned long *doorbell_bitmap;
>>>>> +
>>>>> +    /* this kfd_dev valid or not */
>>>>> +    bool valid;
>>>>>   };
>>>>>     enum kfd_mempool {
>>>>> @@ -1055,6 +1058,10 @@ int kfd_process_restore_queues(struct 
>>>>> kfd_process *p);
>>>>>   void kfd_suspend_all_processes(void);
>>>>>   int kfd_resume_all_processes(void);
>>>>>   +static inline bool is_kfd_process_device_valid(struct 
>>>>> kfd_process_device *pdd) {
>>>>> +    return (pdd && pdd->dev && pdd->dev->kfd && 
>>>>> pdd->dev->kfd->valid);
>>>>> +}
>>>>> +
>>>>>   struct kfd_process_device *kfd_process_device_data_by_id(struct 
>>>>> kfd_process *process,
>>>>>                                uint32_t gpu_id);
>>>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> index d07acf1b2f93..c06eb9d8008e 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> @@ -1157,8 +1157,6 @@ static void kfd_process_wq_release(struct 
>>>>> work_struct *work)
>>>>>       ef = rcu_access_pointer(p->ef);
>>>>>       dma_fence_signal(ef);
>>>>>   -    kfd_process_remove_sysfs(p);
>>>>> -
>>>>>       kfd_process_kunmap_signal_bo(p);
>>>>>       kfd_process_free_outstanding_kfd_bos(p);
>>>>>       svm_range_list_fini(p);
>>>>> @@ -1173,6 +1171,11 @@ static void kfd_process_wq_release(struct 
>>>>> work_struct *work)
>>>>>         put_task_struct(p->lead_thread);
>>>>>   +    /* the last step is removing process entries under /sys
>>>>> +     * to indicate the process has been terminated.
>>>>> +     */
>>>>
>>>> This comment doesn't provide any useful information. What would be 
>>>> useful is, why this needs to be the last step? Without that, I see 
>>>> no good reason for this change.
>>> ok, this change is not related to the patch. I thought it is better 
>>> to update kfd topology at last step after all kfd process resources 
>>> got released. I will remove this change.
>>>>
>>>>
>>>>> +    kfd_process_remove_sysfs(p);
>>>>> +
>>>>>       kfree(p);
>>>>>   }
>>>>>   @@ -1536,6 +1539,12 @@ static struct kfd_process 
>>>>> *create_process(const struct task_struct *thread)
>>>>>       if (err != 0)
>>>>>           goto err_init_apertures;
>>>>>   +    /* no any kfd_process_device can be created */
>>>>> +    if (!process->n_pdds) {
>>>>> +        err = -ENODEV;
>>>>> +        goto err_init_apertures;
>>>>> +    }
>>>>> +
>>>>>       /* Check XNACK support after PDDs are created in 
>>>>> kfd_init_apertures */
>>>>>       process->xnack_enabled = kfd_process_xnack_mode(process, 
>>>>> false);
>>>>>   diff --git 
>>>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> index c76db22a1000..eaf4ba65466c 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> @@ -124,6 +124,11 @@ int pqm_set_gws(struct process_queue_manager 
>>>>> *pqm, unsigned int qid,
>>>>>           return -EINVAL;
>>>>>       }
>>>>>   +    if (!is_kfd_process_device_valid(pdd)) {
>>>>> +        pr_debug("device 0x%x is not available\n",dev->node_id);
>>>>> +        return -EINVAL;
>>>>> +    }
>>>>> +
>>>>>       /* Only allow one queue per process can have GWS assigned */
>>>>>       if (gws && pdd->qpd.num_gws)
>>>>>           return -EBUSY;
>>>>> @@ -498,6 +503,11 @@ int pqm_destroy_queue(struct 
>>>>> process_queue_manager *pqm, unsigned int qid)
>>>>>       if (WARN_ON(!dev))
>>>>>           return -ENODEV;
>>>>>   +    if (!dev->kfd || !dev->kfd->valid) {
>>>>> +        pr_err("Process device is not valid\n");
>>>>
>>>> Would you expect to see this message during process termination 
>>>> after a hot-unplug? Should this really be an error message, or 
>>>> would an info or debug message be more appropriate?
>>> I will change kfd_get_process_device_data that will include this 
>>> check, then this message will be merged at !pdd case.
>>>>
>>>>
>>>>> +        return -1;
>>>>
>>>> This should be a proper error code. -1 is -EPERM.
>>>>
>>> Same as above.
>>>>
>>>>> +    }
>>>>> +
>>>>>       pdd = kfd_get_process_device_data(dev, pqm->process);
>>>>>       if (!pdd) {
>>>>>           pr_err("Process device data doesn't exist\n");
>>>>> @@ -567,6 +577,10 @@ int pqm_update_queue_properties(struct 
>>>>> process_queue_manager *pqm,
>>>>>           pdd = kfd_get_process_device_data(q->device, q->process);
>>>>>           if (!pdd)
>>>>>               return -ENODEV;
>>>>> +
>>>>> +        if (!is_kfd_process_device_valid(pdd))
>>>>> +            return -ENODEV;
>>>>> +         vm = drm_priv_to_vm(pdd->drm_priv);
>>>>>           err = amdgpu_bo_reserve(vm->root.bo, false);
>>>>>           if (err)
>>>>> @@ -612,6 +626,11 @@ int pqm_update_mqd(struct 
>>>>> process_queue_manager *pqm,
>>>>>           return -EFAULT;
>>>>>       }
>>>>>   +    if (!pqn->q->device->kfd->valid) {
>>>>> +        pr_debug("device where queue %d exists is not valid\n", 
>>>>> qid);
>>>>> +        return -EFAULT;
>>>>> +    }
>>>>> +
>>>>>       /* CUs are masked for debugger requirements so deny user 
>>>>> mask  */
>>>>>       if (pqn->q->properties.is_dbg_wa && minfo && 
>>>>> minfo->cu_mask.ptr)
>>>>>           return -EBUSY;
>>>>> @@ -679,6 +698,11 @@ int pqm_get_wave_state(struct 
>>>>> process_queue_manager *pqm,
>>>>>           return -EFAULT;
>>>>>       }
>>>>>   +    if (!pqn->q->device->kfd->valid) {
>>>>> +        pr_debug("device where queue %d exists is not valid\n", 
>>>>> qid);
>>>>> +        return -EFAULT;
>>>>
>>>> EFAULT means "bad address". Probably not the right error code here.
>>>
>>> Will use -EINVAL.
>>>
>>> Thanks
>>>
>>> Xiaogang
>>>
>>>>
>>>> Regards,
>>>>   Felix
>>>>
>>>>
>>>>> +    }
>>>>> +
>>>>>       return 
>>>>> pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm,
>>>>>                                  pqn->q,
>>>>>                                  ctl_stack,