[PATCH 2/3] drm/amdkfd: report xgmi bandwidth between direct peers to the kfd
Kim, Jonathan
Jonathan.Kim at amd.com
Mon Jul 19 15:50:29 UTC 2021
[AMD Official Use Only]
> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar at amd.com>
> Sent: Monday, July 19, 2021 3:22 AM
> To: Kim, Jonathan <Jonathan.Kim at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling at amd.com>
> Subject: Re: [PATCH 2/3] drm/amdkfd: report xgmi bandwidth between
> direct peers to the kfd
>
>
>
> On 7/16/2021 10:13 PM, Jonathan Kim wrote:
> > Report the min/max bandwidth in megabytes to the kfd for direct xgmi
> > connections only.
> >
> > v2: change reporting from num links to bandwidth
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 23 ++++++++++++++++++++++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 +
> > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 12 +++++++++++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
> > drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 12 +++++++++++
> > 5 files changed, 50 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > index bfab2f9fdd17..3978578a1c49 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > @@ -553,6 +553,29 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *s
> > return (uint8_t)ret;
> > }
> >
> > +int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst, struct kgd_dev *src, bool is_min)
> > +{
> > +	struct amdgpu_device *adev = (struct amdgpu_device *)dst, *peer_adev;
> > + int num_links;
> > +
> > + if (adev->asic_type != CHIP_ALDEBARAN)
> > + return 0;
> > +
> > + if (src)
> > + peer_adev = (struct amdgpu_device *)src;
> > +
> > +	num_links = is_min ? 1 : amdgpu_xgmi_get_num_links(adev, peer_adev);
> > + if (num_links < 0) {
> > +		DRM_ERROR("amdgpu: failed to get xgmi num links between node %d and %d. ret = %d\n",
> > + adev->gmc.xgmi.physical_node_id,
> > + peer_adev->gmc.xgmi.physical_node_id, num_links);
> > + num_links = 0;
> > + }
> > +
> > +	/* Aldebaran xGMI DPM is defeatured so assume x16 x 25Gbps for bandwidth. */
> > + return (num_links * 16 * 25000)/BITS_PER_BYTE;
>
> Instead of having ASIC family checks and bandwidth info in the interface
> functions, it's better to have this info come from the base layer
> (amdgpu_xgmi or the xgmi ip). That will help handle other ASICs.
Ok. We can revisit this as a follow-up. Maybe the full solution is a link width/speed support mask analogous to PCIe, along the lines of the sketch below.
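To make that concrete, here is a rough standalone sketch (illustrative only; none of these names, xgmi_link_caps, xgmi_caps_for_asic or xgmi_bandwidth_mbytes, exist in amdgpu today) of what a per-ASIC link width/speed table in the base xGMI layer could look like. The only numbers taken from the patch are the Aldebaran ones: x16 lanes at 25 Gbps with link DPM defeatured, so min and max speeds are equal.

/*
 * Illustrative sketch only, not amdgpu code: keep the per-ASIC xGMI link
 * capabilities in one base-layer table so interface functions need no
 * CHIP_* checks.  Builds standalone with: gcc -o xgmi_bw xgmi_bw.c
 */
#include <stdio.h>

#define BITS_PER_BYTE 8

enum asic_type { CHIP_ALDEBARAN, CHIP_OTHER };

struct xgmi_link_caps {
	unsigned int lane_width;	/* lanes per xGMI link */
	unsigned int min_speed_mbps;	/* per-lane speed in Mbit/s */
	unsigned int max_speed_mbps;	/* equals min when link DPM is defeatured */
};

/* Base-layer lookup; returns NULL when the ASIC has no xGMI caps. */
static const struct xgmi_link_caps *xgmi_caps_for_asic(enum asic_type asic)
{
	static const struct xgmi_link_caps aldebaran = {
		.lane_width = 16,
		.min_speed_mbps = 25000,
		.max_speed_mbps = 25000,
	};

	return asic == CHIP_ALDEBARAN ? &aldebaran : NULL;
}

/* Bandwidth in MB/s for a given number of direct links. */
static unsigned int xgmi_bandwidth_mbytes(enum asic_type asic,
					  unsigned int num_links, int is_min)
{
	const struct xgmi_link_caps *caps = xgmi_caps_for_asic(asic);
	unsigned int speed_mbps;

	if (!caps)
		return 0;

	speed_mbps = is_min ? caps->min_speed_mbps : caps->max_speed_mbps;
	return num_links * caps->lane_width * speed_mbps / BITS_PER_BYTE;
}

int main(void)
{
	/* One Aldebaran link: 16 lanes * 25000 Mbit/s / 8 = 50000 MB/s. */
	printf("min (1 link):  %u MB/s\n", xgmi_bandwidth_mbytes(CHIP_ALDEBARAN, 1, 1));
	printf("max (3 links): %u MB/s\n", xgmi_bandwidth_mbytes(CHIP_ALDEBARAN, 3, 0));
	return 0;
}

With something along those lines, the kfd interface function would only multiply the link count from amdgpu_xgmi_get_num_links() by whatever the table reports, and supporting another ASIC would just mean adding a table entry.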
Thanks,
Jon
>
> Thanks,
> Lijo
>
> >  uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd)
> >  {
> >  	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > index 81264517d532..e12fccb2d2c4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > @@ -226,6 +226,7 @@ uint32_t amdgpu_amdkfd_get_num_gws(struct kgd_dev *kgd);
> > uint32_t amdgpu_amdkfd_get_asic_rev_id(struct kgd_dev *kgd);
> > int amdgpu_amdkfd_get_noretry(struct kgd_dev *kgd);
> >  uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
> > +int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst, struct kgd_dev *src, bool is_min);
> >
> >  /* Read user wptr from a specified user address space with page fault
> >   * disabled. The memory must be pinned and mapped to the hardware when
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > index 8567d5d77346..258cf86b32f6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > @@ -486,6 +486,18 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> > return -EINVAL;
> > }
> >
> > +int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> > + struct amdgpu_device *peer_adev)
> > +{
> > +	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
> > + int i;
> > +
> > + for (i = 0 ; i < top->num_nodes; ++i)
> > + if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
> > + return top->nodes[i].num_links;
> > + return -EINVAL;
> > +}
> > +
> > int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
> > {
> >  	struct psp_xgmi_topology_info *top_info;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > index 12969c0830d5..d2189bf7d428 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > @@ -59,6 +59,8 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
> > int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
> > int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> > struct amdgpu_device *peer_adev);
> > +int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> > + struct amdgpu_device *peer_adev);
> >  uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
> > uint64_t addr);
> > static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > index c6b02aee4993..40ce6239c813 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > @@ -1989,6 +1989,13 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
> > 			sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
> > 			sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
> > sub_type_hdr->num_hops_xgmi = 1;
> > + if (adev->asic_type == CHIP_ALDEBARAN) {
> > +				sub_type_hdr->minimum_bandwidth_mbs =
> > +					amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
> > +							kdev->kgd, NULL, true);
> > +				sub_type_hdr->maximum_bandwidth_mbs =
> > +					sub_type_hdr->minimum_bandwidth_mbs;
> > + }
> > } else {
> > 			sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
> > }
> > @@ -2033,6 +2040,11 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
> > sub_type_hdr->proximity_domain_to = proximity_domain_to;
> > sub_type_hdr->num_hops_xgmi =
> > 		amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd);
> > +	sub_type_hdr->maximum_bandwidth_mbs =
> > +		amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, peer_kdev->kgd, false);
> > +	sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
> > +		amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, NULL, true) : 0;
> > +
> > return 0;
> > }
> >
> >