[PATCH 2/2] drm/amdkfd: enable cooperative groups for gfx11

Tue Jul 18 21:16:51 UTC 2023

Am 2023-07-14 um 05:37 schrieb Jonathan Kim:
> MES can concurrently schedule queues on the device that require
> exclusive device access if marked exclusively_scheduled without the
> requirement of GWS.  Similar to the F32 HWS, MES will manage
> quality of service for these queues.
> Use this for cooperative groups since cooperative groups are device
> occupancy limited.
>
> Since some GFX11 devices can only be debugged with partial CUs, do not
> allow the debugging of cooperative groups on these devices as the CU
> occupancy limit will change on attach.
>
> In addition, zero initialize the MES add queue submission vector for MES
> initialization tests as we do not want these to be cooperative
> dispatches.
>
> NOTE: FIXME MES FW enablement checks are a placeholder at the moment and
> will be updated when the binary revision number is finalized.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
Some nit-picks inline. Looks good to me otherwise.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c               |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h               |  1 +
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c                |  2 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c              |  3 ++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c                |  3 ++-
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c               |  6 +++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c |  9 ++++-----
>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c    | 11 +++++++----
>   drivers/gpu/drm/amd/include/mes_v11_api_def.h         |  4 +++-
>   9 files changed, 27 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index e9091ebfe230..8d13623389d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -638,7 +638,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
>   {
>   	struct amdgpu_mes_queue *queue;
>   	struct amdgpu_mes_gang *gang;
> -	struct mes_add_queue_input queue_input;
> +	struct mes_add_queue_input queue_input = {0};

  Generally, it is preferred to use memset to initialize structures on 
the stack because that also initializes padding.

>   	unsigned long flags;
>   	int r;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 2d6ac30b7135..2053954a235c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -224,6 +224,7 @@ struct mes_add_queue_input {
>   	uint32_t	is_kfd_process;
>   	uint32_t	is_aql_queue;
>   	uint32_t	queue_size;
> +	uint32_t	exclusively_scheduled;
>   };
>   
>   struct mes_remove_queue_input {
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 1bdaa00c0b46..8e67e965f7ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -214,6 +214,8 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
>   	mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
>   	mes_add_queue_pkt.gds_size = input->queue_size;
>   
> +	mes_add_queue_pkt.exclusively_scheduled = input->exclusively_scheduled;
> +
>   	return mes_v11_0_submit_pkt_and_poll_completion(mes,
>   			&mes_add_queue_pkt, sizeof(mes_add_queue_pkt),
>   			offsetof(union MESAPI__ADD_QUEUE, api_status));
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 40ac093b5035..e18401811956 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1487,7 +1487,8 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
>   		goto out_unlock;
>   	}
>   
> -	if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
> +	if (p->debug_trap_enabled && (!kfd_dbg_has_gws_support(dev) ||
> +				       kfd_dbg_has_cwsr_workaround(dev))) {

Indentation looks off. kfd_dbg_has_cwsr_workaround should be indented 
one less space. Otherwise you may be incorrectly implying that the ! 
applies to it.

>   		retval = -EBUSY;
>   		goto out_unlock;
>   	}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index ccfc81f085ce..895e7f690fd0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -753,7 +753,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   		if (!KFD_IS_SOC15(pdd->dev))
>   			return -ENODEV;
>   
> -		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
> +		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
> +					  kfd_dbg_has_cwsr_workaround(pdd->dev)))

Same as above.

>   			return -EBUSY;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 0b3dc754e06b..9f4f75fd2fb2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -508,6 +508,7 @@ static int kfd_gws_init(struct kfd_node *node)
>   {
>   	int ret = 0;
>   	struct kfd_dev *kfd = node->kfd;
> +	uint32_t mes_rev = node->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
>   
>   	if (node->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
>   		return 0;
> @@ -524,7 +525,10 @@ static int kfd_gws_init(struct kfd_node *node)
>   		(KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3)) ||
>   		(KFD_GC_VERSION(node) >= IP_VERSION(10, 3, 0)
>   			&& KFD_GC_VERSION(node) < IP_VERSION(11, 0, 0)
> -			&& kfd->mec2_fw_version >= 0x6b))))
> +			&& kfd->mec2_fw_version >= 0x6b) ||
> +		(KFD_GC_VERSION(node) >= IP_VERSION(11, 0, 0)
> +			&& KFD_GC_VERSION(node) < IP_VERSION(12, 0, 0)
> +			&& mes_rev >= 68)))) /* FIXME: Placeholder version */

Can this comment be removed? You should know the correct MES version 
before submitting this patch.

>   		ret = amdgpu_amdkfd_alloc_gws(node->adev,
>   				node->adev->gds.gws_size, &node->gws);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 761963ad6154..7e8bc7328a79 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -237,10 +237,9 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   	}
>   	queue_input.queue_type = (uint32_t)queue_type;
>   
> -	if (q->gws) {
> -		queue_input.gws_base = 0;
> -		queue_input.gws_size = qpd->num_gws;
> -	}
> +	queue_input.exclusively_scheduled = q->properties.is_gws;
> +	if (q->properties.is_gws && dqm->gws_queue_count > 1)
> +		pr_warn("Runlist is getting oversubscibed.  Expect reduced ROCm performance.\n");

I'm not sure this message makes sense for MES. There isn't a "runlist" 
in the same sense as with the old HWS. And the scheduling algorithm of 
MES should perform better, so maybe we don't need to warn about poor 
performance.

>   
>   	amdgpu_mes_lock(&adev->mes);
>   	r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
> @@ -250,7 +249,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   			q->properties.doorbell_off);
>   		pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
>   		kfd_hws_hang(dqm);
> -}
> +	}
>   
>   	return r;
>   }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index ba9d69054119..139c6b58bb7e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -123,7 +123,7 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
>   	if (!gws && pdd->qpd.num_gws == 0)
>   		return -EINVAL;
>   
> -	if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3)) {
> +	if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && !dev->kfd->shared_resources.enable_mes) {
>   		if (gws)
>   			ret = amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info,
>   				gws, &mem);
> @@ -136,7 +136,8 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
>   	} else {
>   		/*
>   		 * Intentionally set GWS to a non-NULL value
> -		 * for GFX 9.4.3.
> +		 * for devices that do not require GWS but require the formality
> +		 * of setting GWS for cooperative groups.

Change "devices that do not require GWS" to "devices that do not use GWS 
for global wave synchronization".

>   		 */
>   		pqn->q->gws = gws ? ERR_PTR(-ENOMEM) : NULL;
>   	}
> @@ -173,7 +174,8 @@ void pqm_uninit(struct process_queue_manager *pqm)
>   
>   	list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
>   		if (pqn->q && pqn->q->gws &&
> -		    KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3))
> +		    KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> +				!pqn->q->device->kfd->shared_resources.enable_mes)

The indentation looks off here.

>   			amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info,
>   				pqn->q->gws);
>   		kfd_procfs_del_queue(pqn->q);
> @@ -455,7 +457,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>   		}
>   
>   		if (pqn->q->gws) {
> -			if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3))
> +			if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> +					!dev->kfd->shared_resources.enable_mes)

Indentation.

Regards,
   Felix

>   				amdgpu_amdkfd_remove_gws_from_process(
>   						pqm->process->kgd_process_info,
>   						pqn->q->gws);
> diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> index 0997e999416a..b1db2b190187 100644
> --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> @@ -275,7 +275,9 @@ union MESAPI__ADD_QUEUE {
>   			uint32_t trap_en		: 1;
>   			uint32_t is_aql_queue		: 1;
>   			uint32_t skip_process_ctx_clear : 1;
> -			uint32_t reserved		: 19;
> +			uint32_t map_legacy_kq		: 1;
> +			uint32_t exclusively_scheduled	: 1;
> +			uint32_t reserved		: 17;
>   		};
>   		struct MES_API_STATUS		api_status;
>   		uint64_t                        tma_addr;