[PATCH] drm/amdkfd: fix mes based process evictions
Kasiviswanathan, Harish
Harish.Kasiviswanathan at amd.com
Tue Jun 3 21:22:20 UTC 2025
[Public]
So, the code now loops two times over the list of queues. Couple of questions.
(1) Isn't it possible to call suspend_all_queues_mes(dqm) before the first list_for_each_entry()? The first loop only does some housekeeping. That way you can still do get away with single loop.
(2) Also remove_queue_mes() is called for inactive queues (q->properties.is_active). Is that intentional?
Best Regards,
Harish
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Jonathan Kim
Sent: Tuesday, June 3, 2025 12:30 PM
To: amd-gfx at lists.freedesktop.org
Cc: Kuehling, Felix <Felix.Kuehling at amd.com>; Joshi, Mukul <Mukul.Joshi at amd.com>; Kim, Jonathan <Jonathan.Kim at amd.com>
Subject: [PATCH] drm/amdkfd: fix mes based process evictions
First, MES suspend/resume calls should be consistently held under the
KFD device lock (similar to queue invalidation) so consolidate
MES based eviction logic with F32 HWS based eviction logic.
Second, save the last eviction timestamp prior to current eviction to
align with F32 HWS timestamp logging behaviour.
Finally, bail early on failure to remove a single queue as something
has gone really wrong post-suspend and a GPU reset is going to occur
anyway so it's more efficient to just release the device lock.
Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
---
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 73 +++++--------------
1 file changed, 20 insertions(+), 53 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 76359c6a3f3a..c1f0207eeb9e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1219,25 +1219,36 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
q->properties.is_active = false;
decrement_queue_count(dqm, qpd, q);
+ }
- if (dqm->dev->kfd->shared_resources.enable_mes) {
- int err;
+ pdd->last_evict_timestamp = get_jiffies_64();
+
+ if (dqm->dev->kfd->shared_resources.enable_mes) {
+ retval = suspend_all_queues_mes(dqm);
+ if (retval) {
+ dev_err(dev, "Suspending all queues failed");
+ goto out;
+ }
- err = remove_queue_mes(dqm, q, qpd);
- if (err) {
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ retval = remove_queue_mes(dqm, q, qpd);
+ if (retval) {
dev_err(dev, "Failed to evict queue %d\n",
q->properties.queue_id);
- retval = err;
+ goto out;
}
}
- }
- pdd->last_evict_timestamp = get_jiffies_64();
- if (!dqm->dev->kfd->shared_resources.enable_mes)
+
+ retval = resume_all_queues_mes(dqm);
+ if (retval)
+ dev_err(dev, "Resuming all queues failed");
+ } else {
retval = execute_queues_cpsch(dqm,
qpd->is_debug ?
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
USE_DEFAULT_GRACE_PERIOD);
+ }
out:
dqm_unlock(dqm);
@@ -3089,61 +3100,17 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbel
return ret;
}
-static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
- struct qcm_process_device *qpd)
-{
- struct device *dev = dqm->dev->adev->dev;
- int ret = 0;
-
- /* Check if process is already evicted */
- dqm_lock(dqm);
- if (qpd->evicted) {
- /* Increment the evicted count to make sure the
- * process stays evicted before its terminated.
- */
- qpd->evicted++;
- dqm_unlock(dqm);
- goto out;
- }
- dqm_unlock(dqm);
-
- ret = suspend_all_queues_mes(dqm);
- if (ret) {
- dev_err(dev, "Suspending all queues failed");
- goto out;
- }
-
- ret = dqm->ops.evict_process_queues(dqm, qpd);
- if (ret) {
- dev_err(dev, "Evicting process queues failed");
- goto out;
- }
-
- ret = resume_all_queues_mes(dqm);
- if (ret)
- dev_err(dev, "Resuming all queues failed");
-
-out:
- return ret;
-}
-
int kfd_evict_process_device(struct kfd_process_device *pdd)
{
struct device_queue_manager *dqm;
struct kfd_process *p;
- int ret = 0;
p = pdd->process;
dqm = pdd->dev->dqm;
WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
- if (dqm->dev->kfd->shared_resources.enable_mes)
- ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
- else
- ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
-
- return ret;
+ return dqm->ops.evict_process_queues(dqm, &pdd->qpd);
}
int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
--
2.34.1
More information about the amd-gfx
mailing list