[PATCH 1/2] drm/amdgpu: Implement get num of hops between two xgmi device

Liu, Shaoyun Shaoyun.Liu at amd.com
Tue Apr 23 18:30:05 UTC 2019


comments inline.

On 2019-04-23 2:09 p.m., Kuehling, Felix wrote:
> See inline.
>
> On 2019-04-17 2:58 p.m., Liu, Shaoyun wrote:
>> KFD need to provide the info for upper level to determine the data path
>>
>> Change-Id: Idc809e8f3381b9222dd7be96539522d440f3ee7d
>> Signed-off-by: shaoyunl <shaoyun.liu at amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 15 +++++++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 21 +++++++++++++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 ++
>>    4 files changed, 39 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index acf8ae0..3fe9a38 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -27,6 +27,7 @@
>>    #include "amdgpu_gfx.h"
>>    #include <linux/module.h>
>>    #include <linux/dma-buf.h>
>> +#include "amdgpu_xgmi.h"
>>    
>>    static const unsigned int compute_vmid_bitmap = 0xFF00;
>>    
>> @@ -481,6 +482,20 @@ uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
>>    
>>    	return adev->gmc.xgmi.hive_id;
>>    }
>> +uint32_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src)
>> +{
>> +	struct amdgpu_device *peer_adev = (struct amdgpu_device *)src;
>> +	struct amdgpu_device *adev = (struct amdgpu_device *)dst;
>> +	int ret = amdgpu_xgmi_get_hops_count(adev, peer_adev);
>> +
>> +	if (ret < 0) {
>> +		DRM_ERROR("amdgpu: failed to get  xgmi hops count between node %d and %d. ret = %d\n",
>> +			adev->gmc.xgmi.physical_node_id,
>> +			peer_adev->gmc.xgmi.physical_node_id, ret);
>> +		ret = 0;
>> +	}
>> +	return  (uint32_t)ret;
>> +}
>>    
>>    int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
>>    				uint32_t vmid, uint64_t gpu_addr,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index e6a5037..8cc8a5a1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -154,6 +154,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
>>    				  uint32_t *flags);
>>    uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
>>    uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
>> +uint32_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *src);
>>    
>>    #define read_user_wptr(mmptr, wptr, dst)				\
>>    	({								\
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index a48c84c..eaebd47 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -248,6 +248,27 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>>    	return ret;
>>    }
>>    
>> +
>> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
>> +		struct amdgpu_device *peer_adev)
>> +{
>> +	struct psp_xgmi_topology_info *top;
>> +	int ret = 0;
>> +
>> +	top = kzalloc(sizeof(struct psp_xgmi_topology_info), GFP_KERNEL);
> Where does this get freed? Looks like you have a memory leak.
>
> Also, instead of allocating a new copy and querying the info from the
> PSP many times, could the psp_xgmi_topology_info structure be persistent
> somewhere in adev. I think after the driver initialization it won't
> change any more.
>
> Regards,
>     Felix
>
Good catch for the memory leak. I will add free before this exit 
function.  I think of move psp_xgmi_topology_info into adev , but seems  
involves too much change . Let me re-think it .

Regards

shaoyun.liu


>    

>> +	if (!top)
>> +		return -ENOMEM;
>> +	top->num_nodes = 1;
>> +	top->nodes[0].node_id = peer_adev->gmc.xgmi.node_id;
>> +	ret = psp_xgmi_get_topology_info(&adev->psp, 1, top);
>> +	if (ret) {
>> +		dev_err(adev->dev,
>> +			"XGMI: Failed to get topology info\n");
>> +		return ret;
>> +	}
>> +	return top->nodes[0].num_hops;
>> +}
>> +
>>    int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>    {
>>    	struct psp_xgmi_topology_info *hive_topology;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> index 3e9c91e..8a945bf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> @@ -41,6 +41,8 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>>    int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
>>    void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
>>    int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
>> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
>> +		struct amdgpu_device *peer_adev);
>>    
>>    static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
>>    		struct amdgpu_device *bo_adev)


More information about the amd-gfx mailing list