[PATCH 1/3] amd/amdkfd: add a function to wait no process running in kfd

Lazar, Lijo lijo.lazar at amd.com
Tue Mar 26 14:52:03 UTC 2024



On 3/23/2024 1:27 AM, Zhigang Luo wrote:
> Signed-off-by: Zhigang Luo <Zhigang.Luo at amd.com>
> Change-Id: I2a98d513c26107ac76ecf20e951c188afbc7ede6
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 20 ++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  5 ++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 11 +++++++++++
>  3 files changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index d5fde8adf19b..e02bfcec608b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -261,6 +261,26 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
>  	return r;
>  }
>  
> +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device *adev)
> +{
> +	unsigned long end_jiffies;
> +
> +	if (!adev->kfd.dev)
> +		return 0;
> +
> +	end_jiffies = msecs_to_jiffies(AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS) + jiffies;
> +	while (!kgd2kfd_is_processes_table_empty(adev->kfd.dev)) {
> +		if (time_after(jiffies, end_jiffies)) {
> +			dev_err(adev->dev, "wait no process running timeout\n");
> +
> +			return -ETIME;
> +		}
> +		schedule();
> +	}
> +
> +	return 0;
> +}
> +
>  int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)
>  {
>  	int r = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index caee36e52a09..796e09abda69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -38,6 +38,8 @@
>  #include "amdgpu_vm.h"
>  #include "amdgpu_xcp.h"
>  
> +#define AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS 10000
> +
>  extern uint64_t amdgpu_amdkfd_total_mem_size;
>  
>  enum TLB_FLUSH_TYPE {
> @@ -169,7 +171,7 @@ void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle);
>  bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
>  
>  bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
> -
> +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device *adev);
>  int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
>  
>  int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);
> @@ -411,6 +413,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>  void kgd2kfd_device_exit(struct kfd_dev *kfd);
>  void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
>  int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
> +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd);
>  int kgd2kfd_pre_reset(struct kfd_dev *kfd);
>  int kgd2kfd_post_reset(struct kfd_dev *kfd);
>  void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 041ec3de55e7..2bec79e0c721 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -894,6 +894,17 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>  	kfree(kfd);
>  }
>  
> +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd)
> +{
> +	bool is_empty;
> +
> +	mutex_lock(&kfd_processes_mutex);
> +	is_empty = hash_empty(kfd_processes_table);

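This table is declared globally, but a VF FLR (function-level reset) is
device-specific. The check above does not use the kfd argument at all, so
presumably a process on any other device would also keep the table
non-empty. You may want to check whether waiting on the global table is
the right thing to do when there are multiple VFs in a VM.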

Thanks,
Lijo

> +	mutex_unlock(&kfd_processes_mutex);
> +
> +	return is_empty;
> +}
> +
>  int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>  {
>  	struct kfd_node *node;


More information about the amd-gfx mailing list