[PATCH 2/2] drm/amdkfd: enable cooperative groups for gfx11
Felix Kuehling
felix.kuehling at amd.com
Tue Jul 18 21:16:51 UTC 2023
Am 2023-07-14 um 05:37 schrieb Jonathan Kim:
> MES can concurrently schedule queues on the device that require
> exclusive device access if marked exclusively_scheduled without the
> requirement of GWS. Similar to the F32 HWS, MES will manage
> quality of service for these queues.
> Use this for cooperative groups since cooperative groups are device
> occupancy limited.
>
> Since some GFX11 devices can only be debugged with partial CUs, do not
> allow the debugging of cooperative groups on these devices as the CU
> occupancy limit will change on attach.
>
> In addition, zero initialize the MES add queue submission vector for MES
> initialization tests as we do not want these to be cooperative
> dispatches.
>
> NOTE: FIXME MES FW enablement checks are a placeholder at the moment and
> will be updated when the binary revision number is finalized.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
Some nit-picks inline. Looks good to me otherwise.
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 2 ++
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 3 ++-
> drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 3 ++-
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 6 +++++-
> drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 9 ++++-----
> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 11 +++++++----
> drivers/gpu/drm/amd/include/mes_v11_api_def.h | 4 +++-
> 9 files changed, 27 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index e9091ebfe230..8d13623389d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -638,7 +638,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
> {
> struct amdgpu_mes_queue *queue;
> struct amdgpu_mes_gang *gang;
> - struct mes_add_queue_input queue_input;
> + struct mes_add_queue_input queue_input = {0};
Generally, it is preferred to use memset to initialize structures on
the stack because that also initializes padding.
> unsigned long flags;
> int r;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 2d6ac30b7135..2053954a235c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -224,6 +224,7 @@ struct mes_add_queue_input {
> uint32_t is_kfd_process;
> uint32_t is_aql_queue;
> uint32_t queue_size;
> + uint32_t exclusively_scheduled;
> };
>
> struct mes_remove_queue_input {
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 1bdaa00c0b46..8e67e965f7ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -214,6 +214,8 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
> mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
> mes_add_queue_pkt.gds_size = input->queue_size;
>
> + mes_add_queue_pkt.exclusively_scheduled = input->exclusively_scheduled;
> +
> return mes_v11_0_submit_pkt_and_poll_completion(mes,
> &mes_add_queue_pkt, sizeof(mes_add_queue_pkt),
> offsetof(union MESAPI__ADD_QUEUE, api_status));
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 40ac093b5035..e18401811956 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1487,7 +1487,8 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
> goto out_unlock;
> }
>
> - if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
> + if (p->debug_trap_enabled && (!kfd_dbg_has_gws_support(dev) ||
> + kfd_dbg_has_cwsr_workaround(dev))) {
Indentation looks off. kfd_dbg_has_cwsr_workaround should be indented
one less space. Otherwise you may be incorrectly implying that the !
applies to it.
> retval = -EBUSY;
> goto out_unlock;
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index ccfc81f085ce..895e7f690fd0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -753,7 +753,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> if (!KFD_IS_SOC15(pdd->dev))
> return -ENODEV;
>
> - if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
> + if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
> + kfd_dbg_has_cwsr_workaround(pdd->dev)))
Same as above.
> return -EBUSY;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 0b3dc754e06b..9f4f75fd2fb2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -508,6 +508,7 @@ static int kfd_gws_init(struct kfd_node *node)
> {
> int ret = 0;
> struct kfd_dev *kfd = node->kfd;
> + uint32_t mes_rev = node->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
>
> if (node->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
> return 0;
> @@ -524,7 +525,10 @@ static int kfd_gws_init(struct kfd_node *node)
> (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3)) ||
> (KFD_GC_VERSION(node) >= IP_VERSION(10, 3, 0)
> && KFD_GC_VERSION(node) < IP_VERSION(11, 0, 0)
> - && kfd->mec2_fw_version >= 0x6b))))
> + && kfd->mec2_fw_version >= 0x6b) ||
> + (KFD_GC_VERSION(node) >= IP_VERSION(11, 0, 0)
> + && KFD_GC_VERSION(node) < IP_VERSION(12, 0, 0)
> + && mes_rev >= 68)))) /* FIXME: Placeholder version */
Can this comment be removed? You should know the correct MES version
before submitting this patch.
> ret = amdgpu_amdkfd_alloc_gws(node->adev,
> node->adev->gds.gws_size, &node->gws);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 761963ad6154..7e8bc7328a79 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -237,10 +237,9 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> }
> queue_input.queue_type = (uint32_t)queue_type;
>
> - if (q->gws) {
> - queue_input.gws_base = 0;
> - queue_input.gws_size = qpd->num_gws;
> - }
> + queue_input.exclusively_scheduled = q->properties.is_gws;
> + if (q->properties.is_gws && dqm->gws_queue_count > 1)
> + pr_warn("Runlist is getting oversubscibed. Expect reduced ROCm performance.\n");
I'm not sure this message makes sense for MES. There isn't a "runlist"
in the same sense as with the old HWS. And the scheduling algorithm of
MES should perform better, so maybe we don't need to warn about poor
performance.
>
> amdgpu_mes_lock(&adev->mes);
> r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
> @@ -250,7 +249,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> q->properties.doorbell_off);
> pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
> kfd_hws_hang(dqm);
> -}
> + }
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index ba9d69054119..139c6b58bb7e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -123,7 +123,7 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
> if (!gws && pdd->qpd.num_gws == 0)
> return -EINVAL;
>
> - if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3)) {
> + if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && !dev->kfd->shared_resources.enable_mes) {
> if (gws)
> ret = amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info,
> gws, &mem);
> @@ -136,7 +136,8 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
> } else {
> /*
> * Intentionally set GWS to a non-NULL value
> - * for GFX 9.4.3.
> + * for devices that do not require GWS but require the formality
> + * of setting GWS for cooperative groups.
Change "devices that do not require GWS" to "devices that do not use GWS
for global wave synchronization".
> */
> pqn->q->gws = gws ? ERR_PTR(-ENOMEM) : NULL;
> }
> @@ -173,7 +174,8 @@ void pqm_uninit(struct process_queue_manager *pqm)
>
> list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
> if (pqn->q && pqn->q->gws &&
> - KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3))
> + KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> + !pqn->q->device->kfd->shared_resources.enable_mes)
The indentation looks off here.
> amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info,
> pqn->q->gws);
> kfd_procfs_del_queue(pqn->q);
> @@ -455,7 +457,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
> }
>
> if (pqn->q->gws) {
> - if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3))
> + if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> + !dev->kfd->shared_resources.enable_mes)
Indentation.
Regards,
Felix
> amdgpu_amdkfd_remove_gws_from_process(
> pqm->process->kgd_process_info,
> pqn->q->gws);
> diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> index 0997e999416a..b1db2b190187 100644
> --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> @@ -275,7 +275,9 @@ union MESAPI__ADD_QUEUE {
> uint32_t trap_en : 1;
> uint32_t is_aql_queue : 1;
> uint32_t skip_process_ctx_clear : 1;
> - uint32_t reserved : 19;
> + uint32_t map_legacy_kq : 1;
> + uint32_t exclusively_scheduled : 1;
> + uint32_t reserved : 17;
> };
> struct MES_API_STATUS api_status;
> uint64_t tma_addr;
More information about the amd-gfx
mailing list