[PATCH] drm/amdkfd: allow users to target recommended SDMA engines

Felix Kuehling felix.kuehling at amd.com
Wed Jul 24 18:42:42 UTC 2024


On 2024-07-24 13:56, Jonathan Kim wrote:
> Certain GPUs have better copy performance over xGMI on specific
> SDMA engines depending on the source and destination GPU.
> Allow users to create SDMA queues on these recommended engines.
> Close to a 2x overall performance improvement has been observed
> with this optimization.
>
> v2: remove unnecessary CRAT updates and refactor SDMA resource
> bit setting logic.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>

Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 16 ++++++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 38 +++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +-
>   .../amd/amdkfd/kfd_process_queue_manager.c    |  1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.c     | 52 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.h     |  1 +
>   include/uapi/linux/kfd_ioctl.h                |  6 ++-
>   7 files changed, 116 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 32e5db509560..9610cb90a47e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -255,6 +255,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
>   			args->ctx_save_restore_address;
>   	q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
>   	q_properties->ctl_stack_size = args->ctl_stack_size;
> +	q_properties->sdma_engine_id = args->sdma_engine_id;
>   	if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
>   		args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
>   		q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
> @@ -262,6 +263,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
>   		q_properties->type = KFD_QUEUE_TYPE_SDMA;
>   	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
>   		q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
> +	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID)
> +		q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID;
>   	else
>   		return -ENOTSUPP;
>   
> @@ -334,6 +337,18 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>   		goto err_bind_process;
>   	}
>   
> +	if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
> +		int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) +
> +				      kfd_get_num_xgmi_sdma_engines(dev) - 1;
> +
> +		if (q_properties.sdma_engine_id > max_sdma_eng_id) {
> +			err = -EINVAL;
> +			pr_err("sdma_engine_id %i exceeds maximum id of %i\n",
> +			       q_properties.sdma_engine_id, max_sdma_eng_id);
> +			goto err_sdma_engine_id;
> +		}
> +	}
> +
>   	if (!pdd->qpd.proc_doorbells) {
>   		err = kfd_alloc_process_doorbells(dev->kfd, pdd);
>   		if (err) {
> @@ -425,6 +440,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>   	if (wptr_bo)
>   		amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
>   err_wptr_map_gart:
> +err_sdma_engine_id:
>   err_bind_process:
>   err_pdd:
>   	mutex_unlock(&p->mutex);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 4f48507418d2..69315885519d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1534,6 +1534,41 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   			q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
>   		q->properties.sdma_queue_id = q->sdma_id /
>   			kfd_get_num_xgmi_sdma_engines(dqm->dev);
> +	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
> +		int i, num_queues, num_engines, eng_offset = 0, start_engine;
> +		bool free_bit_found = false, is_xgmi = false;
> +
> +		if (q->properties.sdma_engine_id < kfd_get_num_sdma_engines(dqm->dev)) {
> +			num_queues = get_num_sdma_queues(dqm);
> +			num_engines = kfd_get_num_sdma_engines(dqm->dev);
> +			q->properties.type = KFD_QUEUE_TYPE_SDMA;
> +		} else {
> +			num_queues = get_num_xgmi_sdma_queues(dqm);
> +			num_engines = kfd_get_num_xgmi_sdma_engines(dqm->dev);
> +			eng_offset = kfd_get_num_sdma_engines(dqm->dev);
> +			q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI;
> +			is_xgmi = true;
> +		}
> +
> +		/* Scan available bit based on target engine ID. */
> +		start_engine = q->properties.sdma_engine_id - eng_offset;
> +		for (i = start_engine; i < num_queues; i += num_engines) {
> +
> +			if (!test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap))
> +				continue;
> +
> +			clear_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap);
> +			q->sdma_id = i;
> +			q->properties.sdma_queue_id = q->sdma_id / num_engines;
> +			free_bit_found = true;
> +			break;
> +		}
> +
> +		if (!free_bit_found) {
> +			dev_err(dev, "No more SDMA queue to allocate for target ID %i\n",
> +				q->properties.sdma_engine_id);
> +			return -ENOMEM;
> +		}
>   	}
>   
>   	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> @@ -1786,7 +1821,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   	}
>   
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> -		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> +		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI ||
> +		q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
>   		dqm_lock(dqm);
>   		retval = allocate_sdma_queue(dqm, q, qd ? &qd->sdma_id : NULL);
>   		dqm_unlock(dqm);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 2b3ec92981e8..7d26e71dfd04 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -414,13 +414,16 @@ enum kfd_unmap_queues_filter {
>    * @KFD_QUEUE_TYPE_DIQ: DIQ queue type.
>    *
>    * @KFD_QUEUE_TYPE_SDMA_XGMI: Special SDMA queue for XGMI interface.
> + *
> + * @KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:  SDMA user mode queue with target SDMA engine ID.
>    */
>   enum kfd_queue_type  {
>   	KFD_QUEUE_TYPE_COMPUTE,
>   	KFD_QUEUE_TYPE_SDMA,
>   	KFD_QUEUE_TYPE_HIQ,
>   	KFD_QUEUE_TYPE_DIQ,
> -	KFD_QUEUE_TYPE_SDMA_XGMI
> +	KFD_QUEUE_TYPE_SDMA_XGMI,
> +	KFD_QUEUE_TYPE_SDMA_BY_ENG_ID
>   };
>   
>   enum kfd_queue_format {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 21f5a1fb3bf8..8adf20760e67 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -345,6 +345,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>   	switch (type) {
>   	case KFD_QUEUE_TYPE_SDMA:
>   	case KFD_QUEUE_TYPE_SDMA_XGMI:
> +	case KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:
>   		/* SDMA queues are always allocated statically no matter
>   		 * which scheduler mode is used. We also do not need to
>   		 * check whether a SDMA queue can be allocated here, because
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 6f89b06f89d3..f6effaabd4b0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -292,6 +292,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr,
>   			      iolink->max_bandwidth);
>   	sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size",
>   			      iolink->rec_transfer_size);
> +	sysfs_show_32bit_prop(buffer, offs, "recommended_sdma_engine_id_mask",
> +			      iolink->rec_sdma_eng_id_mask);
>   	sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags);
>   
>   	return offs;
> @@ -1265,6 +1267,55 @@ static void kfd_set_iolink_non_coherent(struct kfd_topology_device *to_dev,
>   	}
>   }
>   
> +#define REC_SDMA_NUM_GPU	8
> +static const int rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = {
> +							{ -1, 14, 12, 2, 4, 8, 10, 6 },
> +							{ 14, -1, 2, 10, 8, 4, 6, 12 },
> +							{ 10, 2, -1, 12, 14, 6, 4, 8 },
> +							{ 2, 12, 10, -1, 6, 14, 8, 4 },
> +							{ 4, 8, 14, 6, -1, 10, 12, 2 },
> +							{ 8, 4, 6, 14, 12, -1, 2, 10 },
> +							{ 10, 6, 4, 8, 12, 2, -1, 14 },
> +							{ 6, 12, 8, 4, 2, 10, 14, -1 }};
> +
> +static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev,
> +					     struct kfd_iolink_properties *outbound_link,
> +					     struct kfd_iolink_properties *inbound_link)
> +{
> +	struct kfd_node *gpu = outbound_link->gpu;
> +	struct amdgpu_device *adev = gpu->adev;
> +	int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes;
> +	bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu &&
> +		adev->aid_mask && num_xgmi_nodes &&
> +		(amdgpu_xcp_query_partition_mode(adev->xcp_mgr, AMDGPU_XCP_FL_NONE) ==
> +		      AMDGPU_SPX_PARTITION_MODE) &&
> +		(!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8);
> +
> +	if (support_rec_eng) {
> +		int src_socket_id = adev->gmc.xgmi.physical_node_id;
> +		int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id;
> +
> +		outbound_link->rec_sdma_eng_id_mask =
> +			1 << rec_sdma_eng_map[src_socket_id][dst_socket_id];
> +		inbound_link->rec_sdma_eng_id_mask =
> +			1 << rec_sdma_eng_map[dst_socket_id][src_socket_id];
> +	} else {
> +		int num_sdma_eng = kfd_get_num_sdma_engines(gpu);
> +		int i, eng_offset = 0;
> +
> +		if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI &&
> +		    kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) {
> +			eng_offset = num_sdma_eng;
> +			num_sdma_eng = kfd_get_num_xgmi_sdma_engines(gpu);
> +		}
> +
> +		for (i = 0; i < num_sdma_eng; i++) {
> +			outbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
> +			inbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
> +		}
> +	}
> +}
> +
>   static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
>   {
>   	struct kfd_iolink_properties *link, *inbound_link;
> @@ -1303,6 +1354,7 @@ static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
>   			inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED;
>   			kfd_set_iolink_no_atomics(peer_dev, dev, inbound_link);
>   			kfd_set_iolink_non_coherent(peer_dev, link, inbound_link);
> +			kfd_set_recommended_sdma_engines(peer_dev, link, inbound_link);
>   		}
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> index 2d1c9d771bef..43ba67890f2c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> @@ -121,6 +121,7 @@ struct kfd_iolink_properties {
>   	uint32_t		min_bandwidth;
>   	uint32_t		max_bandwidth;
>   	uint32_t		rec_transfer_size;
> +	uint32_t		rec_sdma_eng_id_mask;
>   	uint32_t		flags;
>   	struct kfd_node		*gpu;
>   	struct kobject		*kobj;
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 285a36601dc9..71a7ce5f2d4c 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -42,9 +42,10 @@
>    * - 1.14 - Update kfd_event_data
>    * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
>    * - 1.16 - Add contiguous VRAM allocation flag
> + * - 1.17 - Add SDMA queue creation with target SDMA engine ID
>    */
>   #define KFD_IOCTL_MAJOR_VERSION 1
> -#define KFD_IOCTL_MINOR_VERSION 16
> +#define KFD_IOCTL_MINOR_VERSION 17
>   
>   struct kfd_ioctl_get_version_args {
>   	__u32 major_version;	/* from KFD */
> @@ -56,6 +57,7 @@ struct kfd_ioctl_get_version_args {
>   #define KFD_IOC_QUEUE_TYPE_SDMA			0x1
>   #define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL		0x2
>   #define KFD_IOC_QUEUE_TYPE_SDMA_XGMI		0x3
> +#define KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID	0x4
>   
>   #define KFD_MAX_QUEUE_PERCENTAGE	100
>   #define KFD_MAX_QUEUE_PRIORITY		15
> @@ -78,6 +80,8 @@ struct kfd_ioctl_create_queue_args {
>   	__u64 ctx_save_restore_address; /* to KFD */
>   	__u32 ctx_save_restore_size;	/* to KFD */
>   	__u32 ctl_stack_size;		/* to KFD */
> +	__u32 sdma_engine_id;		/* to KFD */
> +	__u32 pad;
>   };
>   
>   struct kfd_ioctl_destroy_queue_args {


More information about the amd-gfx mailing list