[PATCH 1/3] amd/amdkfd: add a function to wait no process running in kfd

Luo, Zhigang Zhigang.Luo at amd.com
Mon Mar 25 20:49:27 UTC 2024


[AMD Official Use Only - General]

Hi Mukul,

The purpose of adding this function is to wait for user applications to be killed after calling amdgpu_amdkfd_pre_reset().
If I understand correctly, kfd_locked will already be set after calling amdgpu_amdkfd_pre_reset(), so I added this new function to check only whether kfd_processes_table is empty.

Thanks,
Zhigang

-----Original Message-----
From: Joshi, Mukul <Mukul.Joshi at amd.com>
Sent: Monday, March 25, 2024 12:08 PM
To: Luo, Zhigang <Zhigang.Luo at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Saye, Sashank <Sashank.Saye at amd.com>; Chan, Hing Pong <Jeffrey.Chan at amd.com>; Yang, Philip <Philip.Yang at amd.com>; Lazar, Lijo <Lijo.Lazar at amd.com>; Luo, Zhigang <Zhigang.Luo at amd.com>
Subject: RE: [PATCH 1/3] amd/amdkfd: add a function to wait no process running in kfd

[AMD Official Use Only - General]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
> Zhigang Luo
> Sent: Monday, March 25, 2024 11:18 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Saye, Sashank
> <Sashank.Saye at amd.com>; Chan, Hing Pong <Jeffrey.Chan at amd.com>; Yang,
> Philip <Philip.Yang at amd.com>; Lazar, Lijo <Lijo.Lazar at amd.com>; Luo,
> Zhigang <Zhigang.Luo at amd.com>
> Subject: [PATCH 1/3] amd/amdkfd: add a function to wait no process
> running in kfd
>
> Caution: This message originated from an External Source. Use proper
> caution when opening attachments, clicking links, or responding.
>
>
> Signed-off-by: Zhigang Luo <Zhigang.Luo at amd.com>
> Change-Id: I2a98d513c26107ac76ecf20e951c188afbc7ede6
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 20
> ++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 +++++++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 11 +++++++++++
>  3 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index d5fde8adf19b..e02bfcec608b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -261,6 +261,26 @@ int amdgpu_amdkfd_resume(struct amdgpu_device
> *adev, bool run_pm)
>         return r;
>  }
>
> +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device
> *adev) {
> +       unsigned long end_jiffies;
> +
> +       if (!adev->kfd.dev)
> +               return 0;
> +
> +       end_jiffies =
> msecs_to_jiffies(AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS) + jiffies;
> +       while (!kgd2kfd_is_processes_table_empty(adev->kfd.dev)) {
> +               if (time_after(jiffies, end_jiffies)) {
> +                       dev_err(adev->dev, "wait no process running
> + timeout\n");
> +
> +                       return -ETIME;
> +               }
> +               schedule();
> +       }
> +
> +       return 0;
> +}
> +
>  int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)  {
>         int r = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index caee36e52a09..d46dccc5bbf7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -38,6 +38,8 @@
>  #include "amdgpu_vm.h"
>  #include "amdgpu_xcp.h"
>
> +#define AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS 10000
> +
>  extern uint64_t amdgpu_amdkfd_total_mem_size;
>
>  enum TLB_FLUSH_TYPE {
> @@ -169,7 +171,7 @@ void amdgpu_amdkfd_set_compute_idle(struct
> amdgpu_device *adev, bool idle);  bool
> amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
>
>  bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
> -
> +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device
> *adev);
>  int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
>
>  int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev); @@ -411,6
> +413,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,  void
> kgd2kfd_device_exit(struct kfd_dev *kfd);  void kgd2kfd_suspend(struct
> kfd_dev *kfd, bool run_pm);  int kgd2kfd_resume(struct kfd_dev *kfd,
> bool run_pm);
> +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd);
>  int kgd2kfd_pre_reset(struct kfd_dev *kfd);  int
> kgd2kfd_post_reset(struct kfd_dev *kfd);  void
> kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); @@
> -454,6 +457,11 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>         return 0;
>  }
>
> +static inline bool kgd2kfd_is_processes_table_empty(struct kfd_dev
> +*kfd) {
> +       return true;
> +}
> +
>  static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)  {
>         return 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 041ec3de55e7..2bec79e0c721 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -894,6 +894,17 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>         kfree(kfd);
>  }
>
> +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd) {
> +       bool is_empty;
> +
> +       mutex_lock(&kfd_processes_mutex);
> +       is_empty = hash_empty(kfd_processes_table);
> +       mutex_unlock(&kfd_processes_mutex);
> +
> +       return is_empty;
> +}
> +
We already have a function, kgd2kfd_check_and_lock_kfd(), for this, which is more robust.
Please use that one instead.

Regards,
Mukul

>  int kgd2kfd_pre_reset(struct kfd_dev *kfd)  {
>         struct kfd_node *node;
> --
> 2.25.1




More information about the amd-gfx mailing list