[PATCH 2/3] drm/amdkfd: report xgmi bandwidth between direct peers to the kfd

Kim, Jonathan Jonathan.Kim at amd.com
Mon Jul 19 15:50:29 UTC 2021


[AMD Official Use Only]

> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar at amd.com>
> Sent: Monday, July 19, 2021 3:22 AM
> To: Kim, Jonathan <Jonathan.Kim at amd.com>; amd-
> gfx at lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling at amd.com>
> Subject: Re: [PATCH 2/3] drm/amdkfd: report xgmi bandwidth between
> direct peers to the kfd
>
>
>
> On 7/16/2021 10:13 PM, Jonathan Kim wrote:
> > Report the min/max bandwidth in megabytes to the kfd for direct xgmi
> > connections only.
> >
> > v2: change reporting from num links to bandwidth
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 23 ++++++++++++++++++++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 12 +++++++++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 ++
> >   drivers/gpu/drm/amd/amdkfd/kfd_crat.c      | 12 +++++++++++
> >   5 files changed, 50 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > index bfab2f9fdd17..3978578a1c49 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > @@ -553,6 +553,29 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *s
> >     return  (uint8_t)ret;
> >   }
> >
> > +int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst,
> > +           struct kgd_dev *src, bool is_min)
> > +{
> > +   struct amdgpu_device *adev = (struct amdgpu_device *)dst, *peer_adev;
> > +   int num_links;
> > +
> > +   if (adev->asic_type != CHIP_ALDEBARAN)
> > +           return 0;
> > +
> > +   if (src)
> > +           peer_adev = (struct amdgpu_device *)src;
> > +
> > +   num_links = is_min ? 1 : amdgpu_xgmi_get_num_links(adev, peer_adev);
> > +   if (num_links < 0) {
> > +           DRM_ERROR("amdgpu: failed to get xgmi num links between node %d and %d. ret = %d\n",
> > +                   adev->gmc.xgmi.physical_node_id,
> > +                   peer_adev->gmc.xgmi.physical_node_id, num_links);
> > +           num_links = 0;
> > +   }
> > +
> > +   /* Aldebaran xGMI DPM is defeatured so assume x16 x 25Gbps for bandwidth. */
> > +   return (num_links * 16 * 25000)/BITS_PER_BYTE;
>
> Instead of having ASIC family checks and bandwidth info in interface
> functions, better to have this info come from base layer (amdgpu_xgmi or
> xgmi ip). That will help to handle other ASICs.

Ok. We can revisit this as a follow-up. Maybe the full solution is a link width/speed support mask analogous to PCIe.
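
To sketch what I mean (purely illustrative, standalone userspace toy; the struct, field names and the x16 / 25 Gbps numbers are just placeholders carried over from this patch, not an existing amdgpu interface):

#include <stdio.h>

/* Hypothetical per-ASIC xGMI link caps, analogous to the PCIe link
 * width/speed caps; in the real driver this would come from the
 * amdgpu_xgmi/base layer rather than the interface function.
 */
struct xgmi_link_caps {
        unsigned int max_width;         /* lanes per link, e.g. x16 */
        unsigned int max_speed_mbps;    /* per-lane rate in Mbps, e.g. 25000 */
};

/* Same arithmetic as the patch: lanes * rate (Mbps) / 8 bits per byte,
 * giving MB/s per direct link, scaled by the number of links.
 */
static unsigned int xgmi_bandwidth_mbytes(const struct xgmi_link_caps *caps,
                                          unsigned int num_links)
{
        return num_links * caps->max_width * caps->max_speed_mbps / 8;
}

int main(void)
{
        /* Aldebaran-style placeholder: x16 at 25 Gbps -> 50000 MB/s per link. */
        struct xgmi_link_caps caps = { .max_width = 16, .max_speed_mbps = 25000 };

        printf("1 link:  %u MB/s\n", xgmi_bandwidth_mbytes(&caps, 1));
        printf("4 links: %u MB/s\n", xgmi_bandwidth_mbytes(&caps, 4));
        return 0;
}

That way the interface function only does the arithmetic and the ASIC-specific width/speed comes from the base layer, which should cover other ASICs as you suggest.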

Thanks,

Jon

>
> Thanks,
> Lijo
>
> >   uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd)
> >   {
> >     struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > index 81264517d532..e12fccb2d2c4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > @@ -226,6 +226,7 @@ uint32_t amdgpu_amdkfd_get_num_gws(struct kgd_dev *kgd);
> >   uint32_t amdgpu_amdkfd_get_asic_rev_id(struct kgd_dev *kgd);
> >   int amdgpu_amdkfd_get_noretry(struct kgd_dev *kgd);
> >   uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
> > +int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst, struct kgd_dev *src, bool is_min);
> >
> >   /* Read user wptr from a specified user address space with page fault
> >    * disabled. The memory must be pinned and mapped to the hardware when
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > index 8567d5d77346..258cf86b32f6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > @@ -486,6 +486,18 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> >     return  -EINVAL;
> >   }
> >
> > +int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> > +           struct amdgpu_device *peer_adev)
> > +{
> > +   struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
> > +   int i;
> > +
> > +   for (i = 0 ; i < top->num_nodes; ++i)
> > +           if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
> > +                   return top->nodes[i].num_links;
> > +   return  -EINVAL;
> > +}
> > +
> >   int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
> >   {
> >     struct psp_xgmi_topology_info *top_info;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > index 12969c0830d5..d2189bf7d428 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > @@ -59,6 +59,8 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
> >   int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
> >   int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> >             struct amdgpu_device *peer_adev);
> > +int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> > +           struct amdgpu_device *peer_adev);
> >   uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
> >                                        uint64_t addr);
> >   static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > index c6b02aee4993..40ce6239c813 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > @@ -1989,6 +1989,13 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
> >             sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
> >             sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
> >             sub_type_hdr->num_hops_xgmi = 1;
> > +           if (adev->asic_type == CHIP_ALDEBARAN) {
> > +                   sub_type_hdr->minimum_bandwidth_mbs =
> > +                           amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
> > +                                           kdev->kgd, NULL, true);
> > +                   sub_type_hdr->maximum_bandwidth_mbs =
> > +                           sub_type_hdr->minimum_bandwidth_mbs;
> > +           }
> >     } else {
> >             sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
> >     }
> > @@ -2033,6 +2040,11 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
> >     sub_type_hdr->proximity_domain_to = proximity_domain_to;
> >     sub_type_hdr->num_hops_xgmi =
> >             amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd);
> > +   sub_type_hdr->maximum_bandwidth_mbs =
> > +           amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, peer_kdev->kgd, false);
> > +   sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
> > +           amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, NULL, true) : 0;
> > +
> >     return 0;
> >   }
> >
> >

