[PATCH 2/3] drm/amdkfd: report xgmi bandwidth between direct peers to the kfd

Kim, Jonathan Jonathan.Kim at amd.com
Mon Jul 19 16:02:46 UTC 2021


[AMD Official Use Only]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling at amd.com>
> Sent: Saturday, July 17, 2021 1:47 AM
> To: Kim, Jonathan <Jonathan.Kim at amd.com>; amd-
> gfx at lists.freedesktop.org
> Subject: Re: [PATCH 2/3] drm/amdkfd: report xgmi bandwidth between
> direct peers to the kfd
>
> Am 2021-07-16 um 12:43 p.m. schrieb Jonathan Kim:
> > Report the min/max bandwidth in megabytes to the kfd for direct xgmi
> > connections only.
>
> By "direct XGMI connections", you mean this doesn't work for links with
> more than one hop? Will that spew out DRM_ERROR messages for such links?
> Then it's probably better to downgrade that to an INFO.

No, the DRM_ERROR only happens if the PSP invoke itself fails.
I've added footnotes to the description and code to clear this up.
Non-adjacent peers report num_links as 0, since the indirect route is unknown and the linkage is asymmetrical.
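To make that concrete, here's a small user-space sketch (not kernel code) of the arithmetic the helper in this patch uses; the x16 lane width and 25 Gbps per-lane rate are the Aldebaran assumptions noted in the code comment, and an indirect peer simply contributes num_links = 0 rather than hitting the error path:

    /* Standalone sketch of the bandwidth math, assuming the Aldebaran
     * figures from the patch: 16 lanes at 25 Gbps (25000 Mbps) per link,
     * converted to MB/s. Indirect peers report 0 links, so 0 bandwidth. */
    #include <stdio.h>

    #define BITS_PER_BYTE 8

    static int xgmi_bandwidth_mbytes(int num_links)
    {
            return (num_links * 16 * 25000) / BITS_PER_BYTE;
    }

    int main(void)
    {
            printf("direct peer, 1 link : %d MB/s\n", xgmi_bandwidth_mbytes(1)); /* 50000 */
            printf("direct peer, 4 links: %d MB/s\n", xgmi_bandwidth_mbytes(4)); /* 200000 */
            printf("indirect peer       : %d MB/s\n", xgmi_bandwidth_mbytes(0)); /* 0 */
            return 0;
    }

So a non-adjacent peer degrades to a reported bandwidth of 0 without any DRM_ERROR being printed.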

>
>
> >
> > v2: change reporting from num links to bandwidth
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
>
> This patch is OK to provide bandwidth information on Aldebaran. What can
> we do on older GPUs? Can we assume num_links = 1? Or maybe have some
> hard-coded numbers depending on the number of nodes in the hive?

We could assume num_links = 1, but that wouldn't represent certain non-Aldebaran setups well.
For non-Aldebaran min/max bandwidth, we may be able to get away with setting non-zero values even on non-adjacent peers, since setups have been symmetrical to date, but that may raise questions about why Aldebaran's indirect min/max bandwidth is 0. For consistency, we'd then have to use num_hops to check directness (rough sketch below).
Maybe it's worth making a bid to the FW team to support all other chips moving forward ...
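Here's a rough user-space sketch (hypothetical policy, names made up, not part of this patch) of what I mean by using num_hops to gate directness, so older ASICs without FW-reported link counts would stay consistent with the Aldebaran path:

    /* Hypothetical fallback: trust FW link counts when available (Aldebaran),
     * otherwise assume a single link for direct peers (num_hops == 1) and
     * report 0 for indirect peers. */
    #include <stdbool.h>
    #include <stdio.h>

    static int assumed_num_links(bool fw_reports_links, int fw_num_links, int num_hops)
    {
            if (fw_reports_links)                   /* Aldebaran: use PSP topology info. */
                    return fw_num_links;
            return num_hops == 1 ? 1 : 0;           /* older ASICs: guess one link if direct. */
    }

    int main(void)
    {
            printf("old ASIC, direct peer:   %d link(s)\n", assumed_num_links(false, 0, 1));
            printf("old ASIC, indirect peer: %d link(s)\n", assumed_num_links(false, 0, 2));
            printf("Aldebaran, 4 FW links:   %d link(s)\n", assumed_num_links(true, 4, 1));
            return 0;
    }

The per-link width/speed numbers would still be guesses for older chips, which is why FW support would be the cleaner route.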

Thanks,

Jon

>
> Either way, patch 1 and 2 are
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>
>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 23 ++++++++++++++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 12 +++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 ++
> >  drivers/gpu/drm/amd/amdkfd/kfd_crat.c      | 12 +++++++++++
> >  5 files changed, 50 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > index bfab2f9fdd17..3978578a1c49 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > @@ -553,6 +553,29 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *s
> >     return  (uint8_t)ret;
> >  }
> >
> > +int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst, struct kgd_dev *src, bool is_min)
> > +{
> > +   struct amdgpu_device *adev = (struct amdgpu_device *)dst, *peer_adev;
> > +   int num_links;
> > +
> > +   if (adev->asic_type != CHIP_ALDEBARAN)
> > +           return 0;
> > +
> > +   if (src)
> > +           peer_adev = (struct amdgpu_device *)src;
> > +
> > +   num_links = is_min ? 1 : amdgpu_xgmi_get_num_links(adev, peer_adev);
> > +   if (num_links < 0) {
> > +           DRM_ERROR("amdgpu: failed to get xgmi num links between node %d and %d. ret = %d\n",
> > +                   adev->gmc.xgmi.physical_node_id,
> > +                   peer_adev->gmc.xgmi.physical_node_id, num_links);
> > +           num_links = 0;
> > +   }
> > +
> > +   /* Aldebaran xGMI DPM is defeatured so assume x16 x 25Gbps for bandwidth. */
> > +   return (num_links * 16 * 25000)/BITS_PER_BYTE;
> > +}
> > +
> >  uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd)
> >  {
> >     struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > index 81264517d532..e12fccb2d2c4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > @@ -226,6 +226,7 @@ uint32_t amdgpu_amdkfd_get_num_gws(struct kgd_dev *kgd);
> >  uint32_t amdgpu_amdkfd_get_asic_rev_id(struct kgd_dev *kgd);
> >  int amdgpu_amdkfd_get_noretry(struct kgd_dev *kgd);
> >  uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
> > +int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct kgd_dev *dst, struct kgd_dev *src, bool is_min);
> >
> >  /* Read user wptr from a specified user address space with page fault
> >   * disabled. The memory must be pinned and mapped to the hardware when
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > index 8567d5d77346..258cf86b32f6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > @@ -486,6 +486,18 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> >     return  -EINVAL;
> >  }
> >
> > +int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> > +           struct amdgpu_device *peer_adev)
> > +{
> > +   struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
> > +   int i;
> > +
> > +   for (i = 0 ; i < top->num_nodes; ++i)
> > +           if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
> > +                   return top->nodes[i].num_links;
> > +   return  -EINVAL;
> > +}
> > +
> >  int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
> >  {
> >     struct psp_xgmi_topology_info *top_info;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > index 12969c0830d5..d2189bf7d428 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > @@ -59,6 +59,8 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
> >  int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
> >  int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> >             struct amdgpu_device *peer_adev);
> > +int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> > +           struct amdgpu_device *peer_adev);
> >  uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
> >                                        uint64_t addr);
> >  static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > index c6b02aee4993..40ce6239c813 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > @@ -1989,6 +1989,13 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
> >             sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
> >             sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
> >             sub_type_hdr->num_hops_xgmi = 1;
> > +           if (adev->asic_type == CHIP_ALDEBARAN) {
> > +                   sub_type_hdr->minimum_bandwidth_mbs =
> > +                           amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
> > +                                           kdev->kgd, NULL, true);
> > +                   sub_type_hdr->maximum_bandwidth_mbs =
> > +                           sub_type_hdr->minimum_bandwidth_mbs;
> > +           }
> >     } else {
> >             sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
> >     }
> > @@ -2033,6 +2040,11 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
> >     sub_type_hdr->proximity_domain_to = proximity_domain_to;
> >     sub_type_hdr->num_hops_xgmi =
> >             amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd);
> > +   sub_type_hdr->maximum_bandwidth_mbs =
> > +           amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, peer_kdev->kgd, false);
> > +   sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
> > +           amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->kgd, NULL, true) : 0;
> > +
> >     return 0;
> >  }
> >

