[PATCH] drm/amdkfd: Restore all processes on post reset

Eric Huang jinhuieric.huang at amd.com
Wed Jul 28 17:31:54 UTC 2021


This fixes a gpu_recovery bug on multi-GPU systems: when one
GPU is reset, applications running on the other GPUs hang
because the KFD post-reset path does not restore the running
processes. It also fixes a bug in kfd_process_evict_queues():
when one GPU hangs, eviction stops at the first failure and
is rolled back, so the queues of processes running on the
other GPUs cannot be evicted.

Signed-off-by: Eric Huang <jinhuieric.huang at amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 24 +-----------------------
 2 files changed, 2 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 24b5e0aa1eac..daf1c19bd799 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -984,7 +984,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 	if (!kfd->init_complete)
 		return 0;
 
-	ret = kfd_resume(kfd);
+	ret = kgd2kfd_resume(kfd, false, true);
 	if (ret)
 		return ret;
 	atomic_dec(&kfd_locked);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 38a9dee40785..9272a12c1db8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1879,36 +1879,14 @@ int kfd_process_evict_queues(struct kfd_process *p)
 {
 	int r = 0;
 	int i;
-	unsigned int n_evicted = 0;
 
 	for (i = 0; i < p->n_pdds; i++) {
 		struct kfd_process_device *pdd = p->pdds[i];
 
 		r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
 							    &pdd->qpd);
-		if (r) {
+		if (r)
 			pr_err("Failed to evict process queues\n");
-			goto fail;
-		}
-		n_evicted++;
-	}
-
-	return r;
-
-fail:
-	/* To keep state consistent, roll back partial eviction by
-	 * restoring queues
-	 */
-	for (i = 0; i < p->n_pdds; i++) {
-		struct kfd_process_device *pdd = p->pdds[i];
-
-		if (n_evicted == 0)
-			break;
-		if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
-							      &pdd->qpd))
-			pr_err("Failed to restore queues\n");
-
-		n_evicted--;
 	}
 
 	return r;
-- 
2.25.1



More information about the amd-gfx mailing list