[PATCH 2/2] drm/amdkfd: Check preemption status on all XCDs

Joshi, Mukul Mukul.Joshi at amd.com
Thu Mar 14 19:28:18 UTC 2024


[AMD Official Use Only - General]



From: Kuehling, Felix <Felix.Kuehling at amd.com>
Sent: Thursday, March 14, 2024 2:39 PM
To: Joshi, Mukul <Mukul.Joshi at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Cornwall, Jay <Jay.Cornwall at amd.com>
Subject: Re: [PATCH 2/2] drm/amdkfd: Check preemption status on all XCDs

On 2024-03-14 12:00, Mukul Joshi wrote:

This patch adds the following functionality:

- Check the queue preemption status on all XCDs in a partition
  for GFX 9.4.3.

- Update the queue preemption debug message to print the
  doorbell id of the queue for which preemption failed.

- Change the signature of the check_preemption_failed function to
  return a bool instead of uint32_t and to take the MQD manager
  as an argument.



Suggested-by: Jay Cornwall <jay.cornwall at amd.com>

Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>

---

 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  3 +--

 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  | 18 +++++++++++++

 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h  |  4 ++-

 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  4 +--

 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  4 +--

 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  4 +--

 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   | 25 ++++++++++++++++---

 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  4 +--

 8 files changed, 52 insertions(+), 14 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

index 1ce398ab0b3d..151fabf84040 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

@@ -1997,8 +1997,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,

         * check those fields

         */

        mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];

-       if (mqd_mgr->check_preemption_failed(dqm->packet_mgr.priv_queue->queue->mqd)) {

-               dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n");

+       if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {

                while (halt_if_hws_hang)

                        schedule();

                return -ETIME;

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c

index 050a6936ff84..cbec8c87c984 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c

@@ -290,3 +290,21 @@ uint64_t kfd_mqd_stride(struct mqd_manager *mm,

 {

        return mm->mqd_size;

 }

+

+bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id,

+                                 uint32_t inst)

+{

+       if (doorbell_id) {

+               struct device *dev = node->adev->dev;

+

+               if (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3))

Could this be made more generic? E.g.:

       if (node->adev->xcp_mgr && node->adev->xcp_mgr->num_xcps > 0)



Yes, this seems much better. I will make this change before submitting.



Thanks,

Mukul



Other than that, the series is

Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>





+                       dev_err(dev, "XCC %d: Queue preemption failed for queue with doorbell_id: %x\n",

+                                                     inst, doorbell_id);

+               else

+                       dev_err(dev, "Queue preemption failed for queue with doorbell_id: %x\n",

+                                                     doorbell_id);

+               return true;

+       }

+

+       return false;

+}

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h

index ba3eebb2ca6d..17cc1f25c8d0 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h

@@ -119,7 +119,7 @@ struct mqd_manager {

 #if defined(CONFIG_DEBUG_FS)

        int     (*debugfs_show_mqd)(struct seq_file *m, void *data);

 #endif

-       uint32_t (*check_preemption_failed)(void *mqd);

+       bool (*check_preemption_failed)(struct mqd_manager *mm, void *mqd);

        uint64_t (*mqd_stride)(struct mqd_manager *mm,

                               struct queue_properties *p);



@@ -198,4 +198,6 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev,

 uint64_t kfd_hiq_mqd_stride(struct kfd_node *dev);

 uint64_t kfd_mqd_stride(struct mqd_manager *mm,

                        struct queue_properties *q);

+bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id,

+                                 uint32_t inst);

 #endif /* KFD_MQD_MANAGER_H_ */

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c

index 8f9f56f7a8b0..05f3ac2eaef9 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c

@@ -206,11 +206,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd,

        q->is_active = QUEUE_IS_ACTIVE(*q);

 }



-static uint32_t check_preemption_failed(void *mqd)

+static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)

 {

        struct cik_mqd *m = (struct cik_mqd *)mqd;



-       return m->queue_doorbell_id0;

+       return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);

 }



 static void update_mqd(struct mqd_manager *mm, void *mqd,

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c

index d4cf7d845928..2eff37aaf827 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c

@@ -224,11 +224,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,

        q->is_active = QUEUE_IS_ACTIVE(*q);

 }



-static uint32_t check_preemption_failed(void *mqd)

+static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)

 {

        struct v10_compute_mqd *m = (struct v10_compute_mqd *)mqd;



-       return m->queue_doorbell_id0;

+       return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);

 }



 static int get_wave_state(struct mqd_manager *mm, void *mqd,

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c

index 2b9f57c267eb..68dbc0399c87 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c

@@ -278,11 +278,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,

        q->is_active = QUEUE_IS_ACTIVE(*q);

 }



-static uint32_t check_preemption_failed(void *mqd)

+static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)

 {

        struct v11_compute_mqd *m = (struct v11_compute_mqd *)mqd;



-       return m->queue_doorbell_id0;

+       return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);

 }



 static int get_wave_state(struct mqd_manager *mm, void *mqd,

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c

index 7c93a0932677..6bddc16808d7 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c

@@ -316,11 +316,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,

 }





-static uint32_t check_preemption_failed(void *mqd)

+static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)

 {

        struct v9_mqd *m = (struct v9_mqd *)mqd;



-       return m->queue_doorbell_id0;

+       return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);

 }



 static int get_wave_state(struct mqd_manager *mm, void *mqd,

@@ -607,6 +607,24 @@ static int destroy_hiq_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,

        return err;

 }



+static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd)

+{

+       uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev);

+       uint32_t xcc_mask = mm->dev->xcc_mask;

+       int inst = 0, xcc_id;

+       struct v9_mqd *m;

+       bool ret = false;

+

+       for_each_inst(xcc_id, xcc_mask) {

+               m = get_mqd(mqd + hiq_mqd_size * inst);

+               ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev,

+                                      m->queue_doorbell_id0, inst);

+               ++inst;

+       }

+

+       return ret;

+}

+

 static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj,

                               struct kfd_mem_obj *xcc_mqd_mem_obj,

                               uint64_t offset)

@@ -881,15 +899,16 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,

 #if defined(CONFIG_DEBUG_FS)

                mqd->debugfs_show_mqd = debugfs_show_mqd;

 #endif

-               mqd->check_preemption_failed = check_preemption_failed;

                if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) {

                        mqd->init_mqd = init_mqd_hiq_v9_4_3;

                        mqd->load_mqd = hiq_load_mqd_kiq_v9_4_3;

                        mqd->destroy_mqd = destroy_hiq_mqd_v9_4_3;

+                       mqd->check_preemption_failed = check_preemption_failed_v9_4_3;

                } else {

                        mqd->init_mqd = init_mqd_hiq;

                        mqd->load_mqd = kfd_hiq_load_mqd_kiq;

                        mqd->destroy_mqd = destroy_hiq_mqd;

+                       mqd->check_preemption_failed = check_preemption_failed;

                }

                break;

        case KFD_MQD_TYPE_DIQ:

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c

index dbc868e0363f..c1fafc502515 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c

@@ -237,11 +237,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd,

        q->is_active = QUEUE_IS_ACTIVE(*q);

 }



-static uint32_t check_preemption_failed(void *mqd)

+static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)

 {

        struct vi_mqd *m = (struct vi_mqd *)mqd;



-       return m->queue_doorbell_id0;

+       return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);

 }



 static void update_mqd(struct mqd_manager *mm, void *mqd,
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240314/43122eec/attachment-0001.htm>


More information about the amd-gfx mailing list