[PATCH 1/3] amd/amdkfd: add a function to wait no process running in kfd
Chen, Xiaogang
xiaogang.chen at amd.com
Tue Mar 26 06:31:11 UTC 2024
On 3/25/2024 10:18 AM, Zhigang Luo wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> Signed-off-by: Zhigang Luo<Zhigang.Luo at amd.com>
> Change-Id: I2a98d513c26107ac76ecf20e951c188afbc7ede6
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 20 ++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 +++++++++-
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11 +++++++++++
> 3 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index d5fde8adf19b..e02bfcec608b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -261,6 +261,26 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
> return r;
> }
>
> +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device *adev)
> +{
> + unsigned long end_jiffies;
> +
> + if (!adev->kfd.dev)
> + return 0;
> +
> + end_jiffies = msecs_to_jiffies(AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS) + jiffies;
> + while (!kgd2kfd_is_processes_table_empty(adev->kfd.dev)) {
> + if (time_after(jiffies, end_jiffies)) {
> + dev_err(adev->dev, "wait no process running timeout\n");
> +
> + return -ETIME;
> + }
> + schedule();
> + }
> +
> + return 0;
> +}
> +
> int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)
> {
> int r = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index caee36e52a09..d46dccc5bbf7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -38,6 +38,8 @@
> #include "amdgpu_vm.h"
> #include "amdgpu_xcp.h"
>
> +#define AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS 10000
> +
> extern uint64_t amdgpu_amdkfd_total_mem_size;
>
> enum TLB_FLUSH_TYPE {
> @@ -169,7 +171,7 @@ void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle);
> bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
>
> bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
> -
> +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device *adev);
> int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
>
> int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);
> @@ -411,6 +413,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
> void kgd2kfd_device_exit(struct kfd_dev *kfd);
> void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
> int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
> +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd);
> int kgd2kfd_pre_reset(struct kfd_dev *kfd);
> int kgd2kfd_post_reset(struct kfd_dev *kfd);
> void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
> @@ -454,6 +457,11 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
> return 0;
> }
>
> +static inline bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd)
> +{
> + return true;
> +}
> +
> static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
> {
> return 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 041ec3de55e7..2bec79e0c721 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -894,6 +894,17 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
> kfree(kfd);
> }
>
> +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd)
> +{
> + bool is_empty;
> +
> + mutex_lock(&kfd_processes_mutex);
> + is_empty = hash_empty(kfd_processes_table);
> + mutex_unlock(&kfd_processes_mutex);
> +
> + return is_empty;
> +}
> +
The hash table kfd_processes_table being empty does not mean that all kfd
processes have been terminated. A kfd process is terminated through a
dedicated workqueue, kfd_process_wq, which runs asynchronously from
kfd_processes_table becoming empty. The resources and data structures that
kfd processes use may not yet have been released even though
kfd_processes_table is empty.
I think a more reliable way to know that all kfd processes have been
terminated is to check the kobject under /sys: /sys/class/kfd/kfd/proc.
When this directory is empty, we know that either there are no kfd
processes or all kfd processes have been terminated.
Regards,
Xiaogang
> int kgd2kfd_pre_reset(struct kfd_dev *kfd)
> {
> struct kfd_node *node;
> --
> 2.25.1
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240326/c7cc5528/attachment-0001.htm>
More information about the amd-gfx
mailing list