[PATCH] drm/amdkfd: allow users to target recommended SDMA engines
Felix Kuehling
felix.kuehling at amd.com
Wed Jul 24 18:42:42 UTC 2024
On 2024-07-24 13:56, Jonathan Kim wrote:
> Certain GPUs have better copy performance over xGMI on specific
> SDMA engines depending on the source and destination GPU.
> Allow users to create SDMA queues on these recommended engines.
> Close to 2x overall performance has been observed with this
> optimization.
>
> v2: remove unnecessary CRAT updates and refactor SDMA resource
> bit-setting logic.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 16 ++++++
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 38 +++++++++++++-
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +-
> .../amd/amdkfd/kfd_process_queue_manager.c | 1 +
> drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 52 +++++++++++++++++++
> drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 1 +
> include/uapi/linux/kfd_ioctl.h | 6 ++-
> 7 files changed, 116 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 32e5db509560..9610cb90a47e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -255,6 +255,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
> args->ctx_save_restore_address;
> q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
> q_properties->ctl_stack_size = args->ctl_stack_size;
> + q_properties->sdma_engine_id = args->sdma_engine_id;
> if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
> args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
> q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
> @@ -262,6 +263,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
> q_properties->type = KFD_QUEUE_TYPE_SDMA;
> else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
> q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
> + else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID)
> + q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID;
> else
> return -ENOTSUPP;
>
> @@ -334,6 +337,18 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
> goto err_bind_process;
> }
>
> + if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
> + int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) +
> + kfd_get_num_xgmi_sdma_engines(dev) - 1;
> +
> + if (q_properties.sdma_engine_id > max_sdma_eng_id) {
> + err = -EINVAL;
> + pr_err("sdma_engine_id %i exceeds maximum id of %i\n",
> + q_properties.sdma_engine_id, max_sdma_eng_id);
> + goto err_sdma_engine_id;
> + }
> + }
> +
> if (!pdd->qpd.proc_doorbells) {
> err = kfd_alloc_process_doorbells(dev->kfd, pdd);
> if (err) {
> @@ -425,6 +440,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
> if (wptr_bo)
> amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
> err_wptr_map_gart:
> +err_sdma_engine_id:
> err_bind_process:
> err_pdd:
> mutex_unlock(&p->mutex);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 4f48507418d2..69315885519d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1534,6 +1534,41 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
> q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
> q->properties.sdma_queue_id = q->sdma_id /
> kfd_get_num_xgmi_sdma_engines(dqm->dev);
> + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
> + int i, num_queues, num_engines, eng_offset = 0, start_engine;
> + bool free_bit_found = false, is_xgmi = false;
> +
> + if (q->properties.sdma_engine_id < kfd_get_num_sdma_engines(dqm->dev)) {
> + num_queues = get_num_sdma_queues(dqm);
> + num_engines = kfd_get_num_sdma_engines(dqm->dev);
> + q->properties.type = KFD_QUEUE_TYPE_SDMA;
> + } else {
> + num_queues = get_num_xgmi_sdma_queues(dqm);
> + num_engines = kfd_get_num_xgmi_sdma_engines(dqm->dev);
> + eng_offset = kfd_get_num_sdma_engines(dqm->dev);
> + q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI;
> + is_xgmi = true;
> + }
> +
> + /* Scan available bit based on target engine ID. */
> + start_engine = q->properties.sdma_engine_id - eng_offset;
> + for (i = start_engine; i < num_queues; i += num_engines) {
> +
> + if (!test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap))
> + continue;
> +
> + clear_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap);
> + q->sdma_id = i;
> + q->properties.sdma_queue_id = q->sdma_id / num_engines;
> + free_bit_found = true;
> + break;
> + }
> +
> + if (!free_bit_found) {
> + dev_err(dev, "No more SDMA queue to allocate for target ID %i\n",
> + q->properties.sdma_engine_id);
> + return -ENOMEM;
> + }
> }
>
> pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> @@ -1786,7 +1821,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> }
>
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> - q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI ||
> + q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
> dqm_lock(dqm);
> retval = allocate_sdma_queue(dqm, q, qd ? &qd->sdma_id : NULL);
> dqm_unlock(dqm);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 2b3ec92981e8..7d26e71dfd04 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -414,13 +414,16 @@ enum kfd_unmap_queues_filter {
> * @KFD_QUEUE_TYPE_DIQ: DIQ queue type.
> *
> * @KFD_QUEUE_TYPE_SDMA_XGMI: Special SDMA queue for XGMI interface.
> + *
> + * @KFD_QUEUE_TYPE_SDMA_BY_ENG_ID: SDMA user mode queue with target SDMA engine ID.
> */
> enum kfd_queue_type {
> KFD_QUEUE_TYPE_COMPUTE,
> KFD_QUEUE_TYPE_SDMA,
> KFD_QUEUE_TYPE_HIQ,
> KFD_QUEUE_TYPE_DIQ,
> - KFD_QUEUE_TYPE_SDMA_XGMI
> + KFD_QUEUE_TYPE_SDMA_XGMI,
> + KFD_QUEUE_TYPE_SDMA_BY_ENG_ID
> };
>
> enum kfd_queue_format {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 21f5a1fb3bf8..8adf20760e67 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -345,6 +345,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
> switch (type) {
> case KFD_QUEUE_TYPE_SDMA:
> case KFD_QUEUE_TYPE_SDMA_XGMI:
> + case KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:
> /* SDMA queues are always allocated statically no matter
> * which scheduler mode is used. We also do not need to
> * check whether a SDMA queue can be allocated here, because
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 6f89b06f89d3..f6effaabd4b0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -292,6 +292,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr,
> iolink->max_bandwidth);
> sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size",
> iolink->rec_transfer_size);
> + sysfs_show_32bit_prop(buffer, offs, "recommended_sdma_engine_id_mask",
> + iolink->rec_sdma_eng_id_mask);
> sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags);
>
> return offs;
> @@ -1265,6 +1267,55 @@ static void kfd_set_iolink_non_coherent(struct kfd_topology_device *to_dev,
> }
> }
>
> +#define REC_SDMA_NUM_GPU 8
> +static const int rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = {
> + { -1, 14, 12, 2, 4, 8, 10, 6 },
> + { 14, -1, 2, 10, 8, 4, 6, 12 },
> + { 10, 2, -1, 12, 14, 6, 4, 8 },
> + { 2, 12, 10, -1, 6, 14, 8, 4 },
> + { 4, 8, 14, 6, -1, 10, 12, 2 },
> + { 8, 4, 6, 14, 12, -1, 2, 10 },
> + { 10, 6, 4, 8, 12, 2, -1, 14 },
> + { 6, 12, 8, 4, 2, 10, 14, -1 }};
> +
> +static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev,
> + struct kfd_iolink_properties *outbound_link,
> + struct kfd_iolink_properties *inbound_link)
> +{
> + struct kfd_node *gpu = outbound_link->gpu;
> + struct amdgpu_device *adev = gpu->adev;
> + int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes;
> + bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu &&
> + adev->aid_mask && num_xgmi_nodes &&
> + (amdgpu_xcp_query_partition_mode(adev->xcp_mgr, AMDGPU_XCP_FL_NONE) ==
> + AMDGPU_SPX_PARTITION_MODE) &&
> + (!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8);
> +
> + if (support_rec_eng) {
> + int src_socket_id = adev->gmc.xgmi.physical_node_id;
> + int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id;
> +
> + outbound_link->rec_sdma_eng_id_mask =
> + 1 << rec_sdma_eng_map[src_socket_id][dst_socket_id];
> + inbound_link->rec_sdma_eng_id_mask =
> + 1 << rec_sdma_eng_map[dst_socket_id][src_socket_id];
> + } else {
> + int num_sdma_eng = kfd_get_num_sdma_engines(gpu);
> + int i, eng_offset = 0;
> +
> + if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI &&
> + kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) {
> + eng_offset = num_sdma_eng;
> + num_sdma_eng = kfd_get_num_xgmi_sdma_engines(gpu);
> + }
> +
> + for (i = 0; i < num_sdma_eng; i++) {
> + outbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
> + inbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
> + }
> + }
> +}
> +
> static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
> {
> struct kfd_iolink_properties *link, *inbound_link;
> @@ -1303,6 +1354,7 @@ static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
> inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED;
> kfd_set_iolink_no_atomics(peer_dev, dev, inbound_link);
> kfd_set_iolink_non_coherent(peer_dev, link, inbound_link);
> + kfd_set_recommended_sdma_engines(peer_dev, link, inbound_link);
> }
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> index 2d1c9d771bef..43ba67890f2c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> @@ -121,6 +121,7 @@ struct kfd_iolink_properties {
> uint32_t min_bandwidth;
> uint32_t max_bandwidth;
> uint32_t rec_transfer_size;
> + uint32_t rec_sdma_eng_id_mask;
> uint32_t flags;
> struct kfd_node *gpu;
> struct kobject *kobj;
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 285a36601dc9..71a7ce5f2d4c 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -42,9 +42,10 @@
> * - 1.14 - Update kfd_event_data
> * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
> * - 1.16 - Add contiguous VRAM allocation flag
> + * - 1.17 - Add SDMA queue creation with target SDMA engine ID
> */
> #define KFD_IOCTL_MAJOR_VERSION 1
> -#define KFD_IOCTL_MINOR_VERSION 16
> +#define KFD_IOCTL_MINOR_VERSION 17
>
> struct kfd_ioctl_get_version_args {
> __u32 major_version; /* from KFD */
> @@ -56,6 +57,7 @@ struct kfd_ioctl_get_version_args {
> #define KFD_IOC_QUEUE_TYPE_SDMA 0x1
> #define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 0x2
> #define KFD_IOC_QUEUE_TYPE_SDMA_XGMI 0x3
> +#define KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID 0x4
>
> #define KFD_MAX_QUEUE_PERCENTAGE 100
> #define KFD_MAX_QUEUE_PRIORITY 15
> @@ -78,6 +80,8 @@ struct kfd_ioctl_create_queue_args {
> __u64 ctx_save_restore_address; /* to KFD */
> __u32 ctx_save_restore_size; /* to KFD */
> __u32 ctl_stack_size; /* to KFD */
> + __u32 sdma_engine_id; /* to KFD */
> + __u32 pad;
> };
>
> struct kfd_ioctl_destroy_queue_args {
More information about the amd-gfx
mailing list