[PATCH 1/2] amd/amdkfd: sync all devices to wait all processes being evicted

Felix Kuehling felix.kuehling at amd.com
Wed Apr 3 19:49:35 UTC 2024


On 2024-04-03 14:12, Zhigang Luo wrote:
> If more than one device is being reset in parallel, the first
> device calls kfd_suspend_all_processes() to evict all processes
> on all devices, and this call takes time to finish. The other
> devices start their reset and recovery without waiting for it. If
> a process has not been evicted before recovery begins, it will be
> restored, which then causes a page fault.
>
> Signed-off-by: Zhigang Luo <Zhigang.Luo at amd.com>

This patch is

Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c | 17 ++++++-----------
>   1 file changed, 6 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 041ec3de55e7..719d6d365e15 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -960,7 +960,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>   {
>   	struct kfd_node *node;
>   	int i;
> -	int count;
>   
>   	if (!kfd->init_complete)
>   		return;
> @@ -968,12 +967,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>   	/* for runtime suspend, skip locking kfd */
>   	if (!run_pm) {
>   		mutex_lock(&kfd_processes_mutex);
> -		count = ++kfd_locked;
> -		mutex_unlock(&kfd_processes_mutex);
> -
>   		/* For first KFD device suspend all the KFD processes */
> -		if (count == 1)
> +		if (++kfd_locked == 1)
>   			kfd_suspend_all_processes();
> +		mutex_unlock(&kfd_processes_mutex);
>   	}
>   
>   	for (i = 0; i < kfd->num_nodes; i++) {
> @@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>   
>   int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>   {
> -	int ret, count, i;
> +	int ret, i;
>   
>   	if (!kfd->init_complete)
>   		return 0;
> @@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>   	/* for runtime resume, skip unlocking kfd */
>   	if (!run_pm) {
>   		mutex_lock(&kfd_processes_mutex);
> -		count = --kfd_locked;
> -		mutex_unlock(&kfd_processes_mutex);
> -
> -		WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
> -		if (count == 0)
> +		if (--kfd_locked == 0)
>   			ret = kfd_resume_all_processes();
> +		WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
> +		mutex_unlock(&kfd_processes_mutex);
>   	}
>   
>   	return ret;


More information about the amd-gfx mailing list