[PATCH 1/2] amd/amdkfd: sync all devices to wait all processes being evicted
Felix Kuehling
felix.kuehling at amd.com
Wed Apr 3 19:49:35 UTC 2024
On 2024-04-03 14:12, Zhigang Luo wrote:
> If more than one device is being reset in parallel, the first
> device calls kfd_suspend_all_processes() to evict all processes
> on all devices, and this call takes time to finish. The other devices
> start their reset and recovery without waiting for it. If a process has
> not been evicted before recovery runs, it will be restored, which then
> causes a page fault.
>
> Signed-off-by: Zhigang Luo <Zhigang.Luo at amd.com>
This patch is
Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 17 ++++++-----------
> 1 file changed, 6 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 041ec3de55e7..719d6d365e15 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -960,7 +960,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
> {
> struct kfd_node *node;
> int i;
> - int count;
>
> if (!kfd->init_complete)
> return;
> @@ -968,12 +967,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
> /* for runtime suspend, skip locking kfd */
> if (!run_pm) {
> mutex_lock(&kfd_processes_mutex);
> - count = ++kfd_locked;
> - mutex_unlock(&kfd_processes_mutex);
> -
> /* For first KFD device suspend all the KFD processes */
> - if (count == 1)
> + if (++kfd_locked == 1)
> kfd_suspend_all_processes();
> + mutex_unlock(&kfd_processes_mutex);
> }
>
> for (i = 0; i < kfd->num_nodes; i++) {
> @@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>
> int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
> {
> - int ret, count, i;
> + int ret, i;
>
> if (!kfd->init_complete)
> return 0;
> @@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
> /* for runtime resume, skip unlocking kfd */
> if (!run_pm) {
> mutex_lock(&kfd_processes_mutex);
> - count = --kfd_locked;
> - mutex_unlock(&kfd_processes_mutex);
> -
> - WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
> - if (count == 0)
> + if (--kfd_locked == 0)
> ret = kfd_resume_all_processes();
> + WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
> + mutex_unlock(&kfd_processes_mutex);
> }
>
> return ret;
More information about the amd-gfx
mailing list