<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<p>It is safe to access dqm->sched status inside dqm_lock, no
race with gpu reset.</p>
<p>Reviewed-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a><br>
</p>
<div class="moz-cite-prefix">On 2024-10-18 11:10, Shaoyun Liu wrote:<br>
</div>
<blockquote type="cite" cite="mid:20241018151005.743103-1-shaoyun.liu@amd.com">
<pre class="moz-quote-pre" wrap="">From: shaoyunl <a class="moz-txt-link-rfc2396E" href="mailto:shaoyun.liu@amd.com"><shaoyun.liu@amd.com></a>
Add back kfd queues in start scheduling that originally been
removed on stop scheduling.
Signed-off-by: Shaoyun Liu <a class="moz-txt-link-rfc2396E" href="mailto:shaoyun.liu@amd.com"><shaoyun.liu@amd.com></a>
Reviewed-by: Felix Kuehling <a class="moz-txt-link-rfc2396E" href="mailto:felix.kuehling@amd.com"><felix.kuehling@amd.com></a>
---
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 40 +++++++++++++++++--
1 file changed, 37 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index b2b16a812e73..edfb9f98555f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -202,6 +202,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
int r, queue_type;
uint64_t wptr_addr_off;
+ if (!dqm->sched_running || dqm->sched_halt)
+ return 0;
if (!down_read_trylock(&adev->reset_domain->sem))
return -EIO;
@@ -270,6 +272,8 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
int r;
struct mes_remove_queue_input queue_input;
+ if (!dqm->sched_running || dqm->sched_halt)
+ return 0;
if (!down_read_trylock(&adev->reset_domain->sem))
return -EIO;
@@ -292,7 +296,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
return r;
}
-static int remove_all_queues_mes(struct device_queue_manager *dqm)
+static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
{
struct device_process_node *cur;
struct device *dev = dqm->dev->adev->dev;
@@ -319,6 +323,33 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm)
return retval;
}
+static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
+{
+ struct device_process_node *cur;
+ struct device *dev = dqm->dev->adev->dev;
+ struct qcm_process_device *qpd;
+ struct queue *q;
+ int retval = 0;
+
+ list_for_each_entry(cur, &dqm->queues, list) {
+ qpd = cur->qpd;
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if (!q->properties.is_active)
+ continue;
+ retval = add_queue_mes(dqm, q, qpd);
+ if (retval) {
+ dev_err(dev, "%s: Failed to add queue %d for dev %d",
+ __func__,
+ q->properties.queue_id,
+ dqm->dev->id);
+ return retval;
+ }
+ }
+ }
+
+ return retval;
+}
+
static int suspend_all_queues_mes(struct device_queue_manager *dqm)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
@@ -1742,7 +1773,7 @@ static int halt_cpsch(struct device_queue_manager *dqm)
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
USE_DEFAULT_GRACE_PERIOD, false);
else
- ret = remove_all_queues_mes(dqm);
+ ret = remove_all_kfd_queues_mes(dqm);
}
dqm->sched_halt = true;
dqm_unlock(dqm);
@@ -1768,6 +1799,9 @@ static int unhalt_cpsch(struct device_queue_manager *dqm)
ret = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
0, USE_DEFAULT_GRACE_PERIOD);
+ else
+ ret = add_all_kfd_queues_mes(dqm);
+
dqm_unlock(dqm);
return ret;
@@ -1867,7 +1901,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
if (!dqm->dev->kfd->shared_resources.enable_mes)
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
else
- remove_all_queues_mes(dqm);
+ remove_all_kfd_queues_mes(dqm);
dqm->sched_running = false;
</pre>
</blockquote>
</body>
</html>