[PATCH 1/2] drm/amdgpu: Implement get num of hops between two xgmi device

Kuehling, Felix Felix.Kuehling at amd.com
Tue Apr 23 18:09:01 UTC 2019


See inline.

On 2019-04-17 2:58 p.m., Liu, Shaoyun wrote:
> KFD need to provide the info for upper level to determine the data path
>
> Change-Id: Idc809e8f3381b9222dd7be96539522d440f3ee7d
> Signed-off-by: shaoyunl <shaoyun.liu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 15 +++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 21 +++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 ++
>   4 files changed, 39 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index acf8ae0..3fe9a38 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_gfx.h"
>   #include <linux/module.h>
>   #include <linux/dma-buf.h>
> +#include "amdgpu_xgmi.h"
>   
>   static const unsigned int compute_vmid_bitmap = 0xFF00;
>   
> @@ -481,6 +482,20 @@ uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
>   
>   	return adev->gmc.xgmi.hive_id;
>   }
> +uint32_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src)
> +{
> +	struct amdgpu_device *peer_adev = (struct amdgpu_device *)src;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dst;
> +	int ret = amdgpu_xgmi_get_hops_count(adev, peer_adev);
> +
> +	if (ret < 0) {
> +		DRM_ERROR("amdgpu: failed to get  xgmi hops count between node %d and %d. ret = %d\n",
> +			adev->gmc.xgmi.physical_node_id,
> +			peer_adev->gmc.xgmi.physical_node_id, ret);
> +		ret = 0;
> +	}
> +	return  (uint32_t)ret;
> +}
>   
>   int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
>   				uint32_t vmid, uint64_t gpu_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index e6a5037..8cc8a5a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -154,6 +154,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
>   				  uint32_t *flags);
>   uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
>   uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
> +uint32_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
>   
>   #define read_user_wptr(mmptr, wptr, dst)				\
>   	({								\
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index a48c84c..eaebd47 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -248,6 +248,27 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>   	return ret;
>   }
>   
> +
> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> +		struct amdgpu_device *peer_adev)
> +{
> +	struct psp_xgmi_topology_info *top;
> +	int ret = 0;
> +
> +	top = kzalloc(sizeof(struct psp_xgmi_topology_info), GFP_KERNEL);

Where does this get freed? Looks like you have a memory leak.

Also, instead of allocating a new copy and querying the info from the 
PSP many times, could the psp_xgmi_topology_info structure be persistent 
somewhere in adev. I think after the driver initialization it won't 
change any more.

Regards,
   Felix


> +	if (!top)
> +		return -ENOMEM;
> +	top->num_nodes = 1;
> +	top->nodes[0].node_id = peer_adev->gmc.xgmi.node_id;
> +	ret = psp_xgmi_get_topology_info(&adev->psp, 1, top);
> +	if (ret) {
> +		dev_err(adev->dev,
> +			"XGMI: Failed to get topology info\n");
> +		return ret;
> +	}
> +	return top->nodes[0].num_hops;
> +}
> +
>   int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   {
>   	struct psp_xgmi_topology_info *hive_topology;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 3e9c91e..8a945bf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -41,6 +41,8 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>   int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
>   void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
>   int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> +		struct amdgpu_device *peer_adev);
>   
>   static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
>   		struct amdgpu_device *bo_adev)


More information about the amd-gfx mailing list