[PATCH 1/2] drm/amdgpu: Implement get num of hops between two xgmi device
Kuehling, Felix
Felix.Kuehling at amd.com
Tue Apr 23 21:07:21 UTC 2019
It seems to me that amdgpu_hive_info is a driver-internal structure, but
the psp_xgmi_topology structures are an interface with the PSP that may
change in future ASIC generations. So on second thought, adding the
psp_xgmi_topology structures to the psp_xgmi_context (or
amdgpu_hive_info) like that is probably a bad idea. The structures
should probably be defined only in psp_v11_0.c and opaque for the rest
of the driver.
Anyway, this is getting into a bigger cleanup that is not directly
related to this change. We'll probably have to deal with this sooner or
later, when a new PSP version changes the XGMI interfaces.
Either way, the series is Reviewed-by: Felix Kuehling
<Felix.Kuehling at amd.com>
On 2019-04-23 4:21 p.m., Liu, Shaoyun wrote:
> KFD need to provide the info for upper level to determine the data path
>
> Change-Id: Idc809e8f3381b9222dd7be96539522d440f3ee7d
> Signed-off-by: shaoyunl <shaoyun.liu at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 15 +++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 26 ++++++++++++++------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 23 ++++++++++++++++++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 3 ++-
> 5 files changed, 50 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index acf8ae0..8f8523a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -27,6 +27,7 @@
> #include "amdgpu_gfx.h"
> #include <linux/module.h>
> #include <linux/dma-buf.h>
> +#include "amdgpu_xgmi.h"
>
> static const unsigned int compute_vmid_bitmap = 0xFF00;
>
> @@ -481,6 +482,20 @@ uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
>
> return adev->gmc.xgmi.hive_id;
> }
> +uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src)
> +{
> + struct amdgpu_device *peer_adev = (struct amdgpu_device *)src;
> + struct amdgpu_device *adev = (struct amdgpu_device *)dst;
> + int ret = amdgpu_xgmi_get_hops_count(adev, peer_adev);
> +
> + if (ret < 0) {
> + DRM_ERROR("amdgpu: failed to get xgmi hops count between node %d and %d. ret = %d\n",
> + adev->gmc.xgmi.physical_node_id,
> + peer_adev->gmc.xgmi.physical_node_id, ret);
> + ret = 0;
> + }
> + return (uint8_t)ret;
> +}
>
> int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
> uint32_t vmid, uint64_t gpu_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index e6a5037..b0cb94d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -154,6 +154,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
> uint32_t *flags);
> uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
> uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
> +uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
>
> #define read_user_wptr(mmptr, wptr, dst) \
> ({ \
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> index cde113f..acbc18b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> @@ -95,12 +95,26 @@ struct psp_funcs
> int (*ras_cure_posion)(struct psp_context *psp, uint64_t *mode_ptr);
> };
>
> +#define AMDGPU_XGMI_MAX_CONNECTED_NODES 64
> +struct psp_xgmi_node_info {
> + uint64_t node_id;
> + uint8_t num_hops;
> + uint8_t is_sharing_enabled;
> + enum ta_xgmi_assigned_sdma_engine sdma_engine;
> +};
> +
> +struct psp_xgmi_topology_info {
> + uint32_t num_nodes;
> + struct psp_xgmi_node_info nodes[AMDGPU_XGMI_MAX_CONNECTED_NODES];
> +};
> +
> struct psp_xgmi_context {
> uint8_t initialized;
> uint32_t session_id;
> struct amdgpu_bo *xgmi_shared_bo;
> uint64_t xgmi_shared_mc_addr;
> void *xgmi_shared_buf;
> + struct psp_xgmi_topology_info top_info;
> };
>
> struct psp_ras_context {
> @@ -181,18 +195,6 @@ struct amdgpu_psp_funcs {
> enum AMDGPU_UCODE_ID);
> };
>
> -#define AMDGPU_XGMI_MAX_CONNECTED_NODES 64
> -struct psp_xgmi_node_info {
> - uint64_t node_id;
> - uint8_t num_hops;
> - uint8_t is_sharing_enabled;
> - enum ta_xgmi_assigned_sdma_engine sdma_engine;
> -};
> -
> -struct psp_xgmi_topology_info {
> - uint32_t num_nodes;
> - struct psp_xgmi_node_info nodes[AMDGPU_XGMI_MAX_CONNECTED_NODES];
> -};
>
> #define psp_ring_init(psp, type) (psp)->funcs->ring_init((psp), (type))
> #define psp_ring_create(psp, type) (psp)->funcs->ring_create((psp), (type))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index a48c84c..04dfc8b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -238,7 +238,7 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
> /* Each psp need to set the latest topology */
> ret = psp_xgmi_set_topology_info(&adev->psp,
> hive->number_devices,
> - &hive->topology_info);
> + &adev->psp.xgmi_context.top_info);
> if (ret)
> dev_err(adev->dev,
> "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
> @@ -248,9 +248,22 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
> return ret;
> }
>
> +
> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> + struct amdgpu_device *peer_adev)
> +{
> + struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
> + int i;
> +
> + for (i = 0 ; i < top->num_nodes; ++i)
> + if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
> + return top->nodes[i].num_hops;
> + return -EINVAL;
> +}
> +
> int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
> {
> - struct psp_xgmi_topology_info *hive_topology;
> + struct psp_xgmi_topology_info *top_info;
> struct amdgpu_hive_info *hive;
> struct amdgpu_xgmi *entry;
> struct amdgpu_device *tmp_adev = NULL;
> @@ -283,16 +296,16 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
> goto exit;
> }
>
> - hive_topology = &hive->topology_info;
> + top_info = &adev->psp.xgmi_context.top_info;
>
> list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
> list_for_each_entry(entry, &hive->device_list, head)
> - hive_topology->nodes[count++].node_id = entry->node_id;
> + top_info->nodes[count++].node_id = entry->node_id;
> hive->number_devices = count;
>
> /* Each psp need to get the latest topology */
> list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> - ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, hive_topology);
> + ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, top_info);
> if (ret) {
> dev_err(tmp_adev->dev,
> "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 3e9c91e..fbcee31 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -27,7 +27,6 @@
> struct amdgpu_hive_info {
> uint64_t hive_id;
> struct list_head device_list;
> - struct psp_xgmi_topology_info topology_info;
> int number_devices;
> struct mutex hive_lock, reset_lock;
> struct kobject *kobj;
> @@ -41,6 +40,8 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
> int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
> void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
> int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> + struct amdgpu_device *peer_adev);
>
> static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
> struct amdgpu_device *bo_adev)
More information about the amd-gfx
mailing list