[PATCH 1/2] drm/amdgpu: Implement get num of hops between two xgmi device
Liu, Shaoyun
Shaoyun.Liu at amd.com
Tue Apr 23 18:30:05 UTC 2019
comments inline.
On 2019-04-23 2:09 p.m., Kuehling, Felix wrote:
> See inline.
>
> On 2019-04-17 2:58 p.m., Liu, Shaoyun wrote:
>> KFD need to provide the info for upper level to determine the data path
>>
>> Change-Id: Idc809e8f3381b9222dd7be96539522d440f3ee7d
>> Signed-off-by: shaoyunl <shaoyun.liu at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 15 +++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 21 +++++++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++
>> 4 files changed, 39 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index acf8ae0..3fe9a38 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -27,6 +27,7 @@
>> #include "amdgpu_gfx.h"
>> #include <linux/module.h>
>> #include <linux/dma-buf.h>
>> +#include "amdgpu_xgmi.h"
>>
>> static const unsigned int compute_vmid_bitmap = 0xFF00;
>>
>> @@ -481,6 +482,20 @@ uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
>>
>> return adev->gmc.xgmi.hive_id;
>> }
>> +uint32_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src)
>> +{
>> + struct amdgpu_device *peer_adev = (struct amdgpu_device *)src;
>> + struct amdgpu_device *adev = (struct amdgpu_device *)dst;
>> + int ret = amdgpu_xgmi_get_hops_count(adev, peer_adev);
>> +
>> + if (ret < 0) {
>> + DRM_ERROR("amdgpu: failed to get xgmi hops count between node %d and %d. ret = %d\n",
>> + adev->gmc.xgmi.physical_node_id,
>> + peer_adev->gmc.xgmi.physical_node_id, ret);
>> + ret = 0;
>> + }
>> + return (uint32_t)ret;
>> +}
>>
>> int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
>> uint32_t vmid, uint64_t gpu_addr,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index e6a5037..8cc8a5a1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -154,6 +154,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
>> uint32_t *flags);
>> uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
>> uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
>> +uint32_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
>>
>> #define read_user_wptr(mmptr, wptr, dst) \
>> ({ \
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index a48c84c..eaebd47 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -248,6 +248,27 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>> return ret;
>> }
>>
>> +
>> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
>> + struct amdgpu_device *peer_adev)
>> +{
>> + struct psp_xgmi_topology_info *top;
>> + int ret = 0;
>> +
>> + top = kzalloc(sizeof(struct psp_xgmi_topology_info), GFP_KERNEL);
> Where does this get freed? Looks like you have a memory leak.
>
> Also, instead of allocating a new copy and querying the info from the
> PSP many times, could the psp_xgmi_topology_info structure be persistent
> somewhere in adev. I think after the driver initialization it won't
> change any more.
>
> Regards,
> Felix
>
Good catch for the memory leak. I will add free before this exit
function. I think of move psp_xgmi_topology_info into adev , but seems
involves too much change . Let me re-think it .
Regards
shaoyun.liu
>
>> + if (!top)
>> + return -ENOMEM;
>> + top->num_nodes = 1;
>> + top->nodes[0].node_id = peer_adev->gmc.xgmi.node_id;
>> + ret = psp_xgmi_get_topology_info(&adev->psp, 1, top);
>> + if (ret) {
>> + dev_err(adev->dev,
>> + "XGMI: Failed to get topology info\n");
>> + return ret;
>> + }
>> + return top->nodes[0].num_hops;
>> +}
>> +
>> int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>> {
>> struct psp_xgmi_topology_info *hive_topology;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> index 3e9c91e..8a945bf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> @@ -41,6 +41,8 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>> int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
>> void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
>> int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
>> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
>> + struct amdgpu_device *peer_adev);
>>
>> static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
>> struct amdgpu_device *bo_adev)
More information about the amd-gfx
mailing list