[PATCH 1/3] amd/amdkfd: add a function to wait no process running in kfd

Chen, Xiaogang xiaogang.chen at amd.com
Tue Mar 26 06:31:11 UTC 2024


On 3/25/2024 10:18 AM, Zhigang Luo wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> Signed-off-by: Zhigang Luo<Zhigang.Luo at amd.com>
> Change-Id: I2a98d513c26107ac76ecf20e951c188afbc7ede6
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 20 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 +++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 11 +++++++++++
>   3 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index d5fde8adf19b..e02bfcec608b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -261,6 +261,26 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
>          return r;
>   }
>
> +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device *adev)
> +{
> +       unsigned long end_jiffies;
> +
> +       if (!adev->kfd.dev)
> +               return 0;
> +
> +       end_jiffies = msecs_to_jiffies(AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS) + jiffies;
> +       while (!kgd2kfd_is_processes_table_empty(adev->kfd.dev)) {
> +               if (time_after(jiffies, end_jiffies)) {
> +                       dev_err(adev->dev, "wait no process running timeout\n");
> +
> +                       return -ETIME;
> +               }
> +               schedule();
> +       }
> +
> +       return 0;
> +}
> +
>   int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)
>   {
>          int r = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index caee36e52a09..d46dccc5bbf7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -38,6 +38,8 @@
>   #include "amdgpu_vm.h"
>   #include "amdgpu_xcp.h"
>
> +#define AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS 10000
> +
>   extern uint64_t amdgpu_amdkfd_total_mem_size;
>
>   enum TLB_FLUSH_TYPE {
> @@ -169,7 +171,7 @@ void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle);
>   bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
>
>   bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
> -
> +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device *adev);
>   int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
>
>   int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);
> @@ -411,6 +413,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>   void kgd2kfd_device_exit(struct kfd_dev *kfd);
>   void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
>   int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
> +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd);
>   int kgd2kfd_pre_reset(struct kfd_dev *kfd);
>   int kgd2kfd_post_reset(struct kfd_dev *kfd);
>   void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
> @@ -454,6 +457,11 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>          return 0;
>   }
>
> +static inline bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd)
> +{
> +       return true;
> +}
> +
>   static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>   {
>          return 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 041ec3de55e7..2bec79e0c721 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -894,6 +894,17 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>          kfree(kfd);
>   }
>
> +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd)
> +{
> +       bool is_empty;
> +
> +       mutex_lock(&kfd_processes_mutex);
> +       is_empty = hash_empty(kfd_processes_table);
> +       mutex_unlock(&kfd_processes_mutex);
> +
> +       return is_empty;
> +}
> +

The hash table kfd_processes_table being empty does not mean that all kfd
processes have been terminated. A kfd process is torn down through a
dedicated workqueue, kfd_process_wq, which runs asynchronously from
kfd_processes_table becoming empty. The resources and data structures that
kfd processes use may not have been released yet even though
kfd_processes_table is empty.

I think a solid way to know that all kfd processes have been terminated is
to check the kobject under /sys: /sys/class/kfd/kfd/proc. When this
directory is empty, we know that either there are no kfd processes or all
kfd processes have been terminated.

Regards

Xiaogang

>   int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>   {
>          struct kfd_node *node;
> --
> 2.25.1
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240326/c7cc5528/attachment-0001.htm>


More information about the amd-gfx mailing list