[PATCH 2/3] drm/amdkfd: report xgmi bandwidth between direct peers to the kfd

Felix Kuehling felix.kuehling at amd.com
Sat Jul 17 05:46:41 UTC 2021


On 2021-07-16 at 12:43 p.m., Jonathan Kim wrote:
> Report the min/max bandwidth in megabytes to the kfd for direct
> xgmi connections only.

By "direct XGMI connections", do you mean this doesn't work for links with
more than one hop? Will it spew DRM_ERROR messages for such links?
If so, it's probably better to downgrade that message to an INFO.


>
> v2: change reporting from num links to bandwidth
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>

This patch is OK for providing bandwidth information on Aldebaran. What can
we do on older GPUs? Can we assume num_links = 1? Or maybe use
hard-coded numbers depending on the number of nodes in the hive?

Either way, patches 1 and 2 are

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 23 ++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 12 +++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_crat.c      | 12 +++++++++++
>  5 files changed, 50 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index bfab2f9fdd17..3978578a1c49 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -553,6 +553,29 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *s
>  	return  (uint8_t)ret;
>  }
>  
> +int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst, struct kgd_dev *src, bool is_min)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dst, *peer_adev;
> +	int num_links;
> +
> +	if (adev->asic_type != CHIP_ALDEBARAN)
> +		return 0;
> +
> +	if (src)
> +		peer_adev = (struct amdgpu_device *)src;
> +
> +	num_links = is_min ? 1 : amdgpu_xgmi_get_num_links(adev, peer_adev);
> +	if (num_links < 0) {
> +		DRM_ERROR("amdgpu: failed to get xgmi num links between node %d and %d. ret = %d\n",
> +			adev->gmc.xgmi.physical_node_id,
> +			peer_adev->gmc.xgmi.physical_node_id, num_links);
> +		num_links = 0;
> +	}
> +
> +	/* Aldebaran xGMI DPM is defeatured so assume x16 x 25Gbps for bandwidth. */
> +	return (num_links * 16 * 25000)/BITS_PER_BYTE;
> +}
> +
>  uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd)
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 81264517d532..e12fccb2d2c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -226,6 +226,7 @@ uint32_t amdgpu_amdkfd_get_num_gws(struct kgd_dev *kgd);
>  uint32_t amdgpu_amdkfd_get_asic_rev_id(struct kgd_dev *kgd);
>  int amdgpu_amdkfd_get_noretry(struct kgd_dev *kgd);
>  uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
> +int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst, struct kgd_dev *src, bool is_min);
>  
>  /* Read user wptr from a specified user address space with page fault
>   * disabled. The memory must be pinned and mapped to the hardware when
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 8567d5d77346..258cf86b32f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -486,6 +486,18 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
>  	return	-EINVAL;
>  }
>  
> +int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> +		struct amdgpu_device *peer_adev)
> +{
> +	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
> +	int i;
> +
> +	for (i = 0 ; i < top->num_nodes; ++i)
> +		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
> +			return top->nodes[i].num_links;
> +	return	-EINVAL;
> +}
> +
>  int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>  {
>  	struct psp_xgmi_topology_info *top_info;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 12969c0830d5..d2189bf7d428 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -59,6 +59,8 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
>  int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
>  int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
>  		struct amdgpu_device *peer_adev);
> +int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> +		struct amdgpu_device *peer_adev);
>  uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
>  					   uint64_t addr);
>  static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> index c6b02aee4993..40ce6239c813 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> @@ -1989,6 +1989,13 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
>  		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
>  		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
>  		sub_type_hdr->num_hops_xgmi = 1;
> +		if (adev->asic_type == CHIP_ALDEBARAN) {
> +			sub_type_hdr->minimum_bandwidth_mbs =
> +					amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
> +							kdev->kgd, NULL, true);
> +			sub_type_hdr->maximum_bandwidth_mbs =
> +					sub_type_hdr->minimum_bandwidth_mbs;
> +		}
>  	} else {
>  		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
>  	}
> @@ -2033,6 +2040,11 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
>  	sub_type_hdr->proximity_domain_to = proximity_domain_to;
>  	sub_type_hdr->num_hops_xgmi =
>  		amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd);
> +	sub_type_hdr->maximum_bandwidth_mbs =
> +		amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, peer_kdev->kgd, false);
> +	sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
> +		amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, NULL, true) : 0;
> +
>  	return 0;
>  }
>  


More information about the amd-gfx mailing list