[PATCH 2/2] drm/amdkfd: Check preemption status on all XCDs
Felix Kuehling
felix.kuehling at amd.com
Thu Mar 14 18:38:43 UTC 2024
On 2024-03-14 12:00, Mukul Joshi wrote:
> This patch adds the following functionality:
> - Check the queue preemption status on all XCDs in a partition
> for GFX 9.4.3.
> - Update the queue preemption debug message to print the queue
> doorbell id for which preemption failed.
> - Change the signature of check preemption failed function to
> return a bool instead of uint32_t and pass the MQD manager
> as an argument.
>
> Suggested-by: Jay Cornwall <jay.cornwall at amd.com>
> Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
> ---
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 3 +--
> drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 18 +++++++++++++
> drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 4 ++-
> .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 4 +--
> .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c | 4 +--
> .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 4 +--
> .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 25 ++++++++++++++++---
> .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 4 +--
> 8 files changed, 52 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 1ce398ab0b3d..151fabf84040 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1997,8 +1997,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> * check those fields
> */
> mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
> - if (mqd_mgr->check_preemption_failed(dqm->packet_mgr.priv_queue->queue->mqd)) {
> - dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n");
> + if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
> while (halt_if_hws_hang)
> schedule();
> return -ETIME;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> index 050a6936ff84..cbec8c87c984 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> @@ -290,3 +290,21 @@ uint64_t kfd_mqd_stride(struct mqd_manager *mm,
> {
> return mm->mqd_size;
> }
> +
> +bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id,
> + uint32_t inst)
> +{
> + if (doorbell_id) {
> + struct device *dev = node->adev->dev;
> +
> + if (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3))
Could this check be made more generic, rather than being tied to a specific GC IP version? E.g.:
if (node->adev->xcp_mgr && node->adev->xcp_mgr->num_xcps > 0)
Other than that, the series is
Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>
> + dev_err(dev, "XCC %d: Queue preemption failed for queue with doorbell_id: %x\n",
> + inst, doorbell_id);
> + else
> + dev_err(dev, "Queue preemption failed for queue with doorbell_id: %x\n",
> + doorbell_id);
> + return true;
> + }
> +
> + return false;
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> index ba3eebb2ca6d..17cc1f25c8d0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> @@ -119,7 +119,7 @@ struct mqd_manager {
> #if defined(CONFIG_DEBUG_FS)
> int (*debugfs_show_mqd)(struct seq_file *m, void *data);
> #endif
> - uint32_t (*check_preemption_failed)(void *mqd);
> + bool (*check_preemption_failed)(struct mqd_manager *mm, void *mqd);
> uint64_t (*mqd_stride)(struct mqd_manager *mm,
> struct queue_properties *p);
>
> @@ -198,4 +198,6 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev,
> uint64_t kfd_hiq_mqd_stride(struct kfd_node *dev);
> uint64_t kfd_mqd_stride(struct mqd_manager *mm,
> struct queue_properties *q);
> +bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id,
> + uint32_t inst);
> #endif /* KFD_MQD_MANAGER_H_ */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> index 8f9f56f7a8b0..05f3ac2eaef9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> @@ -206,11 +206,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd,
> q->is_active = QUEUE_IS_ACTIVE(*q);
> }
>
> -static uint32_t check_preemption_failed(void *mqd)
> +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
> {
> struct cik_mqd *m = (struct cik_mqd *)mqd;
>
> - return m->queue_doorbell_id0;
> + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
> }
>
> static void update_mqd(struct mqd_manager *mm, void *mqd,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> index d4cf7d845928..2eff37aaf827 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> @@ -224,11 +224,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
> q->is_active = QUEUE_IS_ACTIVE(*q);
> }
>
> -static uint32_t check_preemption_failed(void *mqd)
> +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
> {
> struct v10_compute_mqd *m = (struct v10_compute_mqd *)mqd;
>
> - return m->queue_doorbell_id0;
> + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
> }
>
> static int get_wave_state(struct mqd_manager *mm, void *mqd,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> index 2b9f57c267eb..68dbc0399c87 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> @@ -278,11 +278,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
> q->is_active = QUEUE_IS_ACTIVE(*q);
> }
>
> -static uint32_t check_preemption_failed(void *mqd)
> +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
> {
> struct v11_compute_mqd *m = (struct v11_compute_mqd *)mqd;
>
> - return m->queue_doorbell_id0;
> + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
> }
>
> static int get_wave_state(struct mqd_manager *mm, void *mqd,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> index 7c93a0932677..6bddc16808d7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> @@ -316,11 +316,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
> }
>
>
> -static uint32_t check_preemption_failed(void *mqd)
> +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
> {
> struct v9_mqd *m = (struct v9_mqd *)mqd;
>
> - return m->queue_doorbell_id0;
> + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
> }
>
> static int get_wave_state(struct mqd_manager *mm, void *mqd,
> @@ -607,6 +607,24 @@ static int destroy_hiq_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
> return err;
> }
>
> +static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd)
> +{
> + uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev);
> + uint32_t xcc_mask = mm->dev->xcc_mask;
> + int inst = 0, xcc_id;
> + struct v9_mqd *m;
> + bool ret = false;
> +
> + for_each_inst(xcc_id, xcc_mask) {
> + m = get_mqd(mqd + hiq_mqd_size * inst);
> + ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev,
> + m->queue_doorbell_id0, inst);
> + ++inst;
> + }
> +
> + return ret;
> +}
> +
> static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj,
> struct kfd_mem_obj *xcc_mqd_mem_obj,
> uint64_t offset)
> @@ -881,15 +899,16 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
> #if defined(CONFIG_DEBUG_FS)
> mqd->debugfs_show_mqd = debugfs_show_mqd;
> #endif
> - mqd->check_preemption_failed = check_preemption_failed;
> if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) {
> mqd->init_mqd = init_mqd_hiq_v9_4_3;
> mqd->load_mqd = hiq_load_mqd_kiq_v9_4_3;
> mqd->destroy_mqd = destroy_hiq_mqd_v9_4_3;
> + mqd->check_preemption_failed = check_preemption_failed_v9_4_3;
> } else {
> mqd->init_mqd = init_mqd_hiq;
> mqd->load_mqd = kfd_hiq_load_mqd_kiq;
> mqd->destroy_mqd = destroy_hiq_mqd;
> + mqd->check_preemption_failed = check_preemption_failed;
> }
> break;
> case KFD_MQD_TYPE_DIQ:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> index dbc868e0363f..c1fafc502515 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> @@ -237,11 +237,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd,
> q->is_active = QUEUE_IS_ACTIVE(*q);
> }
>
> -static uint32_t check_preemption_failed(void *mqd)
> +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
> {
> struct vi_mqd *m = (struct vi_mqd *)mqd;
>
> - return m->queue_doorbell_id0;
> + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
> }
>
> static void update_mqd(struct mqd_manager *mm, void *mqd,
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240314/6e09ea65/attachment-0001.htm>
More information about the amd-gfx mailing list