[PATCH 2/3] drm/amdkfd: report xgmi bandwidth between direct peers to the kfd
Kim, Jonathan
Jonathan.Kim at amd.com
Mon Jul 19 16:02:46 UTC 2021
[AMD Official Use Only]
> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling at amd.com>
> Sent: Saturday, July 17, 2021 1:47 AM
> To: Kim, Jonathan <Jonathan.Kim at amd.com>; amd-
> gfx at lists.freedesktop.org
> Subject: Re: [PATCH 2/3] drm/amdkfd: report xgmi bandwidth between
> direct peers to the kfd
>
> Am 2021-07-16 um 12:43 p.m. schrieb Jonathan Kim:
> > Report the min/max bandwidth in megabytes to the kfd for direct xgmi
> > connections only.
>
> By "direct XGMI connections", you mean this doesn't work for links with
> more than one hop? Will that spew out DRM_ERROR messages for such links?
> Then it's probably better to downgrade that to an INFO.
No, the DRM_ERROR only happens if the psp invoke fails.
I've added footnotes to the description and code to clear this up.
Non-adjacent peers return num_links as 0, since the indirect route is unknown and the linkage is asymmetrical.
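For reference, with the x16 lanes x 25 Gbps per-link assumption in the patch (xGMI DPM is defeatured on Aldebaran), the reported figure works out to:

	bandwidth_mbs = (num_links * 16 * 25000) / BITS_PER_BYTE
	              = num_links * 50000

so a direct peer reports min = 50000 MB/s (num_links = 1) and an indirect peer reports 0.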
>
>
> >
> > v2: change reporting from num links to bandwidth
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
>
> This patch is OK to provide bandwidth information on Aldebaran. What can
> we do on older GPUs? Can we assume num_links = 1? Or maybe have some
> hard-coded numbers depending on the number of nodes in the hive?
We could assume num_links = 1, but that wouldn't represent certain non-Aldebaran setups well.
For non-Aldebaran min/max bandwidth, we may be able to get away with setting non-zero values on non-adjacent peers, since setups have been symmetrical to date, but that may raise questions about why Aldebaran's indirect min/max bandwidth is 0. For consistency, we'd then have to use num_hops to check directness.
Maybe it's worth making a bid to the FW team to support all other chips moving forward ...
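In case it helps the discussion, here's a rough sketch (hypothetical, not part of this patch) of what a num_links = 1 fallback for pre-Aldebaran parts could look like inside amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(); the per-link rate would still need real per-ASIC numbers rather than the Aldebaran figure:

	/* Hypothetical fallback, not in this patch: without PSP num_links
	 * data on pre-Aldebaran parts, assume one direct link per peer pair.
	 * The 16 lanes x 25 Gbps per-link figure is carried over from the
	 * Aldebaran case and would likely need per-ASIC adjustment.
	 */
	if (adev->asic_type != CHIP_ALDEBARAN)
		return (1 * 16 * 25000) / BITS_PER_BYTE;	/* 50000 MB/s */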
Thanks,
Jon
>
> Either way, patch 1 and 2 are
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>
>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 23 ++++++++++++++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
> > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 12 +++++++++++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
> > drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 12 +++++++++++
> > 5 files changed, 50 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > index bfab2f9fdd17..3978578a1c49 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > @@ -553,6 +553,29 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *s
> >  	return (uint8_t)ret;
> >  }
> >
> > +int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst,
> > +					    struct kgd_dev *src, bool is_min)
> > +{
> > +	struct amdgpu_device *adev = (struct amdgpu_device *)dst, *peer_adev;
> > +	int num_links;
> > +
> > +	if (adev->asic_type != CHIP_ALDEBARAN)
> > +		return 0;
> > +
> > +	if (src)
> > +		peer_adev = (struct amdgpu_device *)src;
> > +
> > +	num_links = is_min ? 1 : amdgpu_xgmi_get_num_links(adev, peer_adev);
> > +	if (num_links < 0) {
> > +		DRM_ERROR("amdgpu: failed to get xgmi num links between node %d and %d. ret = %d\n",
> > +			adev->gmc.xgmi.physical_node_id,
> > +			peer_adev->gmc.xgmi.physical_node_id, num_links);
> > +		num_links = 0;
> > +	}
> > +
> > +	/* Aldebaran xGMI DPM is defeatured so assume x16 x 25Gbps for bandwidth. */
> > +	return (num_links * 16 * 25000)/BITS_PER_BYTE;
> > +}
> > +
> >  uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd)
> >  {
> >  	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > index 81264517d532..e12fccb2d2c4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > @@ -226,6 +226,7 @@ uint32_t amdgpu_amdkfd_get_num_gws(struct kgd_dev *kgd);
> >  uint32_t amdgpu_amdkfd_get_asic_rev_id(struct kgd_dev *kgd);
> >  int amdgpu_amdkfd_get_noretry(struct kgd_dev *kgd);
> >  uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
> > +int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst, struct kgd_dev *src, bool is_min);
> >
> >  /* Read user wptr from a specified user address space with page fault
> >   * disabled. The memory must be pinned and mapped to the hardware when
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > index 8567d5d77346..258cf86b32f6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > @@ -486,6 +486,18 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> >  	return -EINVAL;
> >  }
> >
> > +int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> > +		struct amdgpu_device *peer_adev)
> > +{
> > +	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
> > +	int i;
> > +
> > +	for (i = 0 ; i < top->num_nodes; ++i)
> > +		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
> > +			return top->nodes[i].num_links;
> > +	return -EINVAL;
> > +}
> > +
> >  int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
> >  {
> >  	struct psp_xgmi_topology_info *top_info;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > index 12969c0830d5..d2189bf7d428 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > @@ -59,6 +59,8 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
> >  int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
> >  int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> >  		struct amdgpu_device *peer_adev);
> > +int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> > +		struct amdgpu_device *peer_adev);
> >  uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
> >  					   uint64_t addr);
> >  static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > index c6b02aee4993..40ce6239c813 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > @@ -1989,6 +1989,13 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
> >  			sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
> >  			sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
> >  			sub_type_hdr->num_hops_xgmi = 1;
> > +			if (adev->asic_type == CHIP_ALDEBARAN) {
> > +				sub_type_hdr->minimum_bandwidth_mbs =
> > +					amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
> > +							kdev->kgd, NULL, true);
> > +				sub_type_hdr->maximum_bandwidth_mbs =
> > +					sub_type_hdr->minimum_bandwidth_mbs;
> > +			}
> >  		} else {
> >  			sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
> >  		}
> > @@ -2033,6 +2040,11 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
> >  	sub_type_hdr->proximity_domain_to = proximity_domain_to;
> >  	sub_type_hdr->num_hops_xgmi =
> >  		amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd);
> > +	sub_type_hdr->maximum_bandwidth_mbs =
> > +		amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, peer_kdev->kgd, false);
> > +	sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
> > +		amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, NULL, true) : 0;
> > +
> >  	return 0;
> >  }
> >