[PATCH 1/2] drm/amdgpu: Implement get num of hops between two xgmi device

Kuehling, Felix Felix.Kuehling at amd.com
Tue Apr 23 21:07:21 UTC 2019


It seems to me that amdgpu_hive_info is a driver-internal structure, but 
the psp_xgmi_topology structures are an interface with the PSP that may 
change in future ASIC generations. So on second thought, adding the 
psp_xgmi_topology structures to the psp_xgmi_context (or 
amdgpu_hive_info) like that is probably a bad idea. The structures 
should probably be defined only in psp_v11_0.c and be opaque to the rest 
of the driver.

Anyway, this is getting into a bigger cleanup that is not directly 
related to this change. We'll probably have to deal with this sooner or 
later, when a new PSP version changes the XGMI interfaces.

Either way, the series is Reviewed-by: Felix Kuehling 
<Felix.Kuehling at amd.com>

On 2019-04-23 4:21 p.m., Liu, Shaoyun wrote:
> KFD needs to provide this info so the upper level can determine the data path
>
> Change-Id: Idc809e8f3381b9222dd7be96539522d440f3ee7d
> Signed-off-by: shaoyunl <shaoyun.liu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 15 +++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h    | 26 ++++++++++++++------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 23 ++++++++++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  3 ++-
>   5 files changed, 50 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index acf8ae0..8f8523a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_gfx.h"
>   #include <linux/module.h>
>   #include <linux/dma-buf.h>
> +#include "amdgpu_xgmi.h"
>   
>   static const unsigned int compute_vmid_bitmap = 0xFF00;
>   
> @@ -481,6 +482,20 @@ uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
>   
>   	return adev->gmc.xgmi.hive_id;
>   }
> +uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src)
> +{
> +	struct amdgpu_device *peer_adev = (struct amdgpu_device *)src;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dst;
> +	int ret = amdgpu_xgmi_get_hops_count(adev, peer_adev);
> +
> +	if (ret < 0) {
> +		DRM_ERROR("amdgpu: failed to get  xgmi hops count between node %d and %d. ret = %d\n",
> +			adev->gmc.xgmi.physical_node_id,
> +			peer_adev->gmc.xgmi.physical_node_id, ret);
> +		ret = 0;
> +	}
> +	return  (uint8_t)ret;
> +}
>   
>   int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
>   				uint32_t vmid, uint64_t gpu_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index e6a5037..b0cb94d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -154,6 +154,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
>   				  uint32_t *flags);
>   uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
>   uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
> +uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
>   
>   #define read_user_wptr(mmptr, wptr, dst)				\
>   	({								\
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> index cde113f..acbc18b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> @@ -95,12 +95,26 @@ struct psp_funcs
>   	int (*ras_cure_posion)(struct psp_context *psp, uint64_t *mode_ptr);
>   };
>   
> +#define AMDGPU_XGMI_MAX_CONNECTED_NODES		64
> +struct psp_xgmi_node_info {
> +	uint64_t				node_id;
> +	uint8_t					num_hops;
> +	uint8_t					is_sharing_enabled;
> +	enum ta_xgmi_assigned_sdma_engine	sdma_engine;
> +};
> +
> +struct psp_xgmi_topology_info {
> +	uint32_t			num_nodes;
> +	struct psp_xgmi_node_info	nodes[AMDGPU_XGMI_MAX_CONNECTED_NODES];
> +};
> +
>   struct psp_xgmi_context {
>   	uint8_t				initialized;
>   	uint32_t			session_id;
>   	struct amdgpu_bo                *xgmi_shared_bo;
>   	uint64_t                        xgmi_shared_mc_addr;
>   	void                            *xgmi_shared_buf;
> +	struct psp_xgmi_topology_info	top_info;
>   };
>   
>   struct psp_ras_context {
> @@ -181,18 +195,6 @@ struct amdgpu_psp_funcs {
>   					enum AMDGPU_UCODE_ID);
>   };
>   
> -#define AMDGPU_XGMI_MAX_CONNECTED_NODES		64
> -struct psp_xgmi_node_info {
> -	uint64_t				node_id;
> -	uint8_t					num_hops;
> -	uint8_t					is_sharing_enabled;
> -	enum ta_xgmi_assigned_sdma_engine	sdma_engine;
> -};
> -
> -struct psp_xgmi_topology_info {
> -	uint32_t			num_nodes;
> -	struct psp_xgmi_node_info	nodes[AMDGPU_XGMI_MAX_CONNECTED_NODES];
> -};
>   
>   #define psp_ring_init(psp, type) (psp)->funcs->ring_init((psp), (type))
>   #define psp_ring_create(psp, type) (psp)->funcs->ring_create((psp), (type))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index a48c84c..04dfc8b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -238,7 +238,7 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>   	/* Each psp need to set the latest topology */
>   	ret = psp_xgmi_set_topology_info(&adev->psp,
>   					 hive->number_devices,
> -					 &hive->topology_info);
> +					 &adev->psp.xgmi_context.top_info);
>   	if (ret)
>   		dev_err(adev->dev,
>   			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
> @@ -248,9 +248,22 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>   	return ret;
>   }
>   
> +
> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> +		struct amdgpu_device *peer_adev)
> +{
> +	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
> +	int i;
> +
> +	for (i = 0 ; i < top->num_nodes; ++i)
> +		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
> +			return top->nodes[i].num_hops;
> +	return	-EINVAL;
> +}
> +
>   int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   {
> -	struct psp_xgmi_topology_info *hive_topology;
> +	struct psp_xgmi_topology_info *top_info;
>   	struct amdgpu_hive_info *hive;
>   	struct amdgpu_xgmi	*entry;
>   	struct amdgpu_device *tmp_adev = NULL;
> @@ -283,16 +296,16 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   		goto exit;
>   	}
>   
> -	hive_topology = &hive->topology_info;
> +	top_info = &adev->psp.xgmi_context.top_info;
>   
>   	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
>   	list_for_each_entry(entry, &hive->device_list, head)
> -		hive_topology->nodes[count++].node_id = entry->node_id;
> +		top_info->nodes[count++].node_id = entry->node_id;
>   	hive->number_devices = count;
>   
>   	/* Each psp need to get the latest topology */
>   	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> -		ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, hive_topology);
> +		ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, top_info);
>   		if (ret) {
>   			dev_err(tmp_adev->dev,
>   				"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 3e9c91e..fbcee31 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -27,7 +27,6 @@
>   struct amdgpu_hive_info {
>   	uint64_t		hive_id;
>   	struct list_head	device_list;
> -	struct psp_xgmi_topology_info	topology_info;
>   	int number_devices;
>   	struct mutex hive_lock, reset_lock;
>   	struct kobject *kobj;
> @@ -41,6 +40,8 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>   int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
>   void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
>   int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> +		struct amdgpu_device *peer_adev);
>   
>   static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
>   		struct amdgpu_device *bo_adev)


More information about the amd-gfx mailing list