[PATCH] drm/amdkfd: enable cooperative groups for gfx11

Felix Kuehling felix.kuehling at amd.com
Wed Jul 19 16:25:47 UTC 2023


Am 2023-07-19 um 10:36 schrieb Jonathan Kim:
> MES can concurrently schedule queues on the device that require
> exclusive device access if marked exclusively_scheduled without the
> requirement of GWS.  Similar to the F32 HWS, MES will manage
> quality of service for these queues.
> Use this for cooperative groups since cooperative groups are device
> occupancy limited.
>
> Since some GFX11 devices can only be debugged with partial CUs, do not
> allow the debugging of cooperative groups on these devices as the CU
> occupancy limit will change on attach.
>
> In addition, zero initialize the MES add queue submission vector for MES
> initialization tests as we do not want these to be cooperative
> dispatches.
>
> v2: fix up indentation and comments.
> remove unnecessary perf warning on oversubscription.
> change 0 init to 0 memset to deal with padding.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>

Sorry. More indentation nit-picks inline. With those fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c              |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h              |  1 +
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c               |  2 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c             |  3 ++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c               |  3 ++-
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c              |  6 +++++-
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    |  7 ++-----
>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   | 12 ++++++++----
>   drivers/gpu/drm/amd/include/mes_v11_api_def.h        |  4 +++-
>   9 files changed, 27 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index f808841310fd..72ab6a838bb6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -642,6 +642,8 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
>   	unsigned long flags;
>   	int r;
>   
> +	memset(&queue_input, 0, sizeof(struct mes_add_queue_input));
> +
>   	/* allocate the mes queue buffer */
>   	queue = kzalloc(sizeof(struct amdgpu_mes_queue), GFP_KERNEL);
>   	if (!queue) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 2d6ac30b7135..2053954a235c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -224,6 +224,7 @@ struct mes_add_queue_input {
>   	uint32_t	is_kfd_process;
>   	uint32_t	is_aql_queue;
>   	uint32_t	queue_size;
> +	uint32_t	exclusively_scheduled;
>   };
>   
>   struct mes_remove_queue_input {
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 1bdaa00c0b46..8e67e965f7ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -214,6 +214,8 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
>   	mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
>   	mes_add_queue_pkt.gds_size = input->queue_size;
>   
> +	mes_add_queue_pkt.exclusively_scheduled = input->exclusively_scheduled;
> +
>   	return mes_v11_0_submit_pkt_and_poll_completion(mes,
>   			&mes_add_queue_pkt, sizeof(mes_add_queue_pkt),
>   			offsetof(union MESAPI__ADD_QUEUE, api_status));
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 40ac093b5035..e0f9cf6dd8fd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1487,7 +1487,8 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
>   		goto out_unlock;
>   	}
>   
> -	if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
> +	if (p->debug_trap_enabled && (!kfd_dbg_has_gws_support(dev) ||
> +				      kfd_dbg_has_cwsr_workaround(dev))) {
>   		retval = -EBUSY;
>   		goto out_unlock;
>   	}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index ccfc81f085ce..1f82caea59ba 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -753,7 +753,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   		if (!KFD_IS_SOC15(pdd->dev))
>   			return -ENODEV;
>   
> -		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
> +		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
> +					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
>   			return -EBUSY;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 0b3dc754e06b..ebc9674d3ce1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -508,6 +508,7 @@ static int kfd_gws_init(struct kfd_node *node)
>   {
>   	int ret = 0;
>   	struct kfd_dev *kfd = node->kfd;
> +	uint32_t mes_rev = node->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
>   
>   	if (node->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
>   		return 0;
> @@ -524,7 +525,10 @@ static int kfd_gws_init(struct kfd_node *node)
>   		(KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3)) ||
>   		(KFD_GC_VERSION(node) >= IP_VERSION(10, 3, 0)
>   			&& KFD_GC_VERSION(node) < IP_VERSION(11, 0, 0)
> -			&& kfd->mec2_fw_version >= 0x6b))))
> +			&& kfd->mec2_fw_version >= 0x6b) ||
> +		(KFD_GC_VERSION(node) >= IP_VERSION(11, 0, 0)
> +			&& KFD_GC_VERSION(node) < IP_VERSION(12, 0, 0)
> +			&& mes_rev >= 68))))
>   		ret = amdgpu_amdkfd_alloc_gws(node->adev,
>   				node->adev->gds.gws_size, &node->gws);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 761963ad6154..71b7f16c0173 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -237,10 +237,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   	}
>   	queue_input.queue_type = (uint32_t)queue_type;
>   
> -	if (q->gws) {
> -		queue_input.gws_base = 0;
> -		queue_input.gws_size = qpd->num_gws;
> -	}
> +	queue_input.exclusively_scheduled = q->properties.is_gws;
>   
>   	amdgpu_mes_lock(&adev->mes);
>   	r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
> @@ -250,7 +247,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   			q->properties.doorbell_off);
>   		pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
>   		kfd_hws_hang(dqm);
> -}
> +	}
>   
>   	return r;
>   }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index ba9d69054119..e8ee52d70a19 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -123,7 +123,7 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
>   	if (!gws && pdd->qpd.num_gws == 0)
>   		return -EINVAL;
>   
> -	if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3)) {
> +	if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && !dev->kfd->shared_resources.enable_mes) {
>   		if (gws)
>   			ret = amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info,
>   				gws, &mem);
> @@ -136,7 +136,9 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
>   	} else {
>   		/*
>   		 * Intentionally set GWS to a non-NULL value
> -		 * for GFX 9.4.3.
> +		 * for devices that do not use GWS for global wave
> +		 * synchronization but require the formality
> +		 * of setting GWS for cooperative groups.
>   		 */
>   		pqn->q->gws = gws ? ERR_PTR(-ENOMEM) : NULL;
>   	}
> @@ -173,7 +175,8 @@ void pqm_uninit(struct process_queue_manager *pqm)
>   
>   	list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
>   		if (pqn->q && pqn->q->gws &&
> -		    KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3))
> +				KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> +					!pqn->q->device->kfd->shared_resources.enable_mes)

This still looks wrong. You have pieces of the same condition indented 3 
different ways here. I'd prefer to see this (and hoping Thunderbird 
won't mess it up):

  		if (pqn->q && pqn->q->gws &&
		    KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
		    !pqn->q->device->kfd->shared_resources.enable_mes)

Everything indented to line up just behind the open ( on the first line.


>   			amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info,
>   				pqn->q->gws);
>   		kfd_procfs_del_queue(pqn->q);
> @@ -455,7 +458,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>   		}
>   
>   		if (pqn->q->gws) {
> -			if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3))
> +			if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> +						!dev->kfd->shared_resources.enable_mes)

Same as above.

Thanks,
   Felix


>   				amdgpu_amdkfd_remove_gws_from_process(
>   						pqm->process->kgd_process_info,
>   						pqn->q->gws);
> diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> index 0997e999416a..b1db2b190187 100644
> --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> @@ -275,7 +275,9 @@ union MESAPI__ADD_QUEUE {
>   			uint32_t trap_en		: 1;
>   			uint32_t is_aql_queue		: 1;
>   			uint32_t skip_process_ctx_clear : 1;
> -			uint32_t reserved		: 19;
> +			uint32_t map_legacy_kq		: 1;
> +			uint32_t exclusively_scheduled	: 1;
> +			uint32_t reserved		: 17;
>   		};
>   		struct MES_API_STATUS		api_status;
>   		uint64_t                        tma_addr;


More information about the amd-gfx mailing list