[PATCH] drm/amdkfd: Fix a race between queue destroy and process termination

xinhui pan xinhui.pan at amd.com
Tue Jun 15 08:02:33 UTC 2021


free_mqd is called without the dqm lock held, which can cause a double
free of mqd_mem_obj. Fix it by moving the pointer into a temporary while
the lock is still held.
We also need to walk the queues_list with the dqm lock held; otherwise
we hit list corruption.

Signed-off-by: xinhui pan <xinhui.pan at amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c   | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index e6366b408420..575c895fc241 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1484,6 +1484,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 	struct mqd_manager *mqd_mgr;
 	uint64_t sdma_val = 0;
 	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+	void *ptr = NULL;
 
 	/* Get the SDMA queue stats */
 	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
@@ -1543,10 +1544,12 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 	pr_debug("Total of %d queues are accountable so far\n",
 			dqm->total_queue_count);
 
+	swap(ptr, q->mqd_mem_obj);
 	dqm_unlock(dqm);
 
 	/* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */
-	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+	if (ptr)
+		mqd_mgr->free_mqd(mqd_mgr, q->mqd, ptr);
 
 	return retval;
 
@@ -1709,6 +1712,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	enum kfd_unmap_queues_filter filter =
 		KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
 	bool found = false;
+	void *ptr = NULL;
 
 	retval = 0;
 
@@ -1737,8 +1741,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 				qpd->mapped_gws_queue = false;
 			}
 		}
-
-		dqm->total_queue_count--;
 	}
 
 	/* Unregister process */
@@ -1770,13 +1772,20 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	/* Lastly, free mqd resources.
 	 * Do free_mqd() after dqm_unlock to avoid circular locking.
 	 */
+	dqm_lock(dqm);
 	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
 		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
 				q->properties.type)];
 		list_del(&q->list);
 		qpd->queue_count--;
-		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+		dqm->total_queue_count--;
+		swap(ptr, q->mqd_mem_obj);
+		dqm_unlock(dqm);
+		if (ptr)
+			mqd_mgr->free_mqd(mqd_mgr, q->mqd, ptr);
+		dqm_lock(dqm);
 	}
+	dqm_unlock(dqm);
 
 	return retval;
 }
-- 
2.25.1



More information about the amd-gfx mailing list