[PATCH] drm/amd/amdkfd: add/remove kfd queues on start/stop KFD scheduling

Liu, Shaoyun Shaoyun.Liu at amd.com
Fri Oct 18 14:56:22 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

Good catch. Thanks. I will send out another review for that.

Regards
Shaoyun.liu

From: Yang, Philip <Philip.Yang at amd.com>
Sent: Thursday, October 17, 2024 3:47 PM
To: Liu, Shaoyun <Shaoyun.Liu at amd.com>; amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH] drm/amd/amdkfd: add/remove kfd queues on start/stop KFD scheduling



On 2024-10-17 12:12, Shaoyun Liu wrote:

From: shaoyunl <shaoyun.liu at amd.com><mailto:shaoyun.liu at amd.com>



Add back kfd queues in start scheduling that had originally been

removed on stop scheduling.



Signed-off-by: Shaoyun Liu <shaoyun.liu at amd.com><mailto:shaoyun.liu at amd.com>

---

 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 40 +++++++++++++++++--

 1 file changed, 37 insertions(+), 3 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

index b2b16a812e73..542363b4712e 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

@@ -204,6 +204,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,



  if (!down_read_trylock(&adev->reset_domain->sem))

         return -EIO;

+ if (!dqm->sched_running || dqm->sched_halt) {
                    up_read(&adev->reset_domain->sem);



+        return 0;
            }






  memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));

  queue_input.process_id = qpd->pqm->process->pasid;

@@ -272,6 +274,8 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,



  if (!down_read_trylock(&adev->reset_domain->sem))

         return -EIO;

+ if (!dqm->sched_running || dqm->sched_halt) {
                     up_read(&adev->reset_domain->sem);




+        return 0;

              }

It would be simpler to move the sched_halt/sched_running check outside the reset semaphore lock, but I am not sure whether that is safe.

Regards,

Philip





  memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));

  queue_input.doorbell_offset = q->properties.doorbell_off;

@@ -292,7 +296,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,

  return r;

 }



-static int remove_all_queues_mes(struct device_queue_manager *dqm)

+static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)

 {

  struct device_process_node *cur;

  struct device *dev = dqm->dev->adev->dev;

@@ -319,6 +323,33 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm)

  return retval;

 }



+static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)

+{

+ struct device_process_node *cur;

+ struct device *dev = dqm->dev->adev->dev;

+ struct qcm_process_device *qpd;

+ struct queue *q;

+ int retval = 0;

+

+ list_for_each_entry(cur, &dqm->queues, list) {

+        qpd = cur->qpd;

+        list_for_each_entry(q, &qpd->queues_list, list) {

+                if (!q->properties.is_active)

+                        continue;

+                retval = add_queue_mes(dqm, q, qpd);

+                if (retval) {

+                        dev_err(dev, "%s: Failed to add queue %d for dev %d",

+                               __func__,

+                          q->properties.queue_id,

+                          dqm->dev->id);

+                        return retval;

+                }

+        }

+ }

+

+ return retval;

+}

+

 static int suspend_all_queues_mes(struct device_queue_manager *dqm)

 {

  struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;

@@ -1742,7 +1773,7 @@ static int halt_cpsch(struct device_queue_manager *dqm)

                                         KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,

                   USE_DEFAULT_GRACE_PERIOD, false);

         else

-                ret = remove_all_queues_mes(dqm);

+                ret = remove_all_kfd_queues_mes(dqm);

  }

  dqm->sched_halt = true;

  dqm_unlock(dqm);

@@ -1768,6 +1799,9 @@ static int unhalt_cpsch(struct device_queue_manager *dqm)

         ret = execute_queues_cpsch(dqm,

                                   KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,

                 0, USE_DEFAULT_GRACE_PERIOD);

+ else

+        ret = add_all_kfd_queues_mes(dqm);

+

  dqm_unlock(dqm);



  return ret;

@@ -1867,7 +1901,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)

  if (!dqm->dev->kfd->shared_resources.enable_mes)

         unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);

  else

-        remove_all_queues_mes(dqm);

+        remove_all_kfd_queues_mes(dqm);



  dqm->sched_running = false;


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20241018/6ffafb9d/attachment-0001.htm>


More information about the amd-gfx mailing list