[PATCH] amdkfd: Explicitly specify data type amdkfd_process_info in related functions

Zhu Lingshan lingshan.zhu at amd.com
Thu Nov 7 09:52:18 UTC 2024


On 11/5/2024 5:49 AM, Felix Kuehling wrote:
> On 2024-10-31 23:20, Zhu Lingshan wrote:
>> On 10/22/2024 4:01 PM, Zhu Lingshan wrote:
>>> On 10/22/2024 12:20 PM, Felix Kuehling wrote:
>>>> On 2024-10-14 23:51, Zhu Lingshan wrote:
>>>>> This commit specifies data type struct amdkfd_process_info
>>>>> rather than general void* in ralted functions.
>>>> Several interfaces in amdgpu_amdkfd.h use void * as opaque pointers, e.g. process_info, mem_obj, drm_priv. The reasons are partly historical because KFD used to be in its own kernel module. That said, there is nothing fundamentally wrong with opaque pointers when the upper layers doesn't need to see the contents in the objects returned by the lower layer.
>>> void * is workable but imperfect, IMHO it representing a compromise that could ideally be improve especially when we know exactly the data structure type.
>>> This change provides better readability, type safety, compiling checking, and avoid the castings
>> shall I make any improvements on this patch?
>>>> It makes me wonder, though, why you're singling out just process_info? Are you proposing to change more interfaces to eliminate opaque pointers?
>>> That is because I just happen to read process_info related code, I can surely improve others if any individuals of them also represents a certain data type.
>> do you want me to change all void * opaque pointer in a series or just this one for now?
> I'd prefer to keep it consistent. If we decide we don't want opqaue pointers in our API, we should clean up all the APIs in that header file. Or we decide, opqaue pointers are OK, and we leave things the way they are. I'm OK with either of those two options. I want to avoid a piece-meal solution that leaves things inconsistent.
I will try clean all opaque pointers that always point to consistent data types. I will send a series later.

Thanks
Lingshan 
>
> Regards,
>   Felix
>
>> Thanks
>> Lingshan
>>> Thanks
>>> Lingshan
>>>> Regards,
>>>>   Felix
>>>>
>>>>> kfd_process->kgd_process_info is initialized
>>>>> in init_kfd_vm() by such code:
>>>>>
>>>>> static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
>>>>>                        struct dma_fence **ef) {
>>>>>         struct amdkfd_process_info *info = NULL;
>>>>>
>>>>>         if (!*process_info) {
>>>>>                 info = kzalloc(sizeof(*info), GFP_KERNEL);
>>>>>
>>>>>         *process_info = info;
>>>>> }
>>>>>
>>>>> That means kfd_process->kgd_process_info is type
>>>>> struct amdkfd_process_info, therefore we should avoid using void*
>>>>>
>>>>> Using a specified data type other than void* can help improve
>>>>> readability. There are other benifits like: type safety,
>>>>> avoid casting and better compling chekings.
>>>>>
>>>>> Signed-off-by: Zhu Lingshan <lingshan.zhu at amd.com>> ---
>>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    | 10 +++---
>>>>>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 31 ++++++++-----------
>>>>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 +-
>>>>>  3 files changed, 19 insertions(+), 24 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>>> index f9d119448442..c1346b8c9ab7 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>>>> @@ -239,8 +239,8 @@ void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj);
>>>>>  int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,
>>>>>  				void **mem_obj);
>>>>>  void amdgpu_amdkfd_free_gws(struct amdgpu_device *adev, void *mem_obj);
>>>>> -int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem);
>>>>> -int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem);
>>>>> +int amdgpu_amdkfd_add_gws_to_process(struct amdkfd_process_info *pinfo, void *gws, struct kgd_mem **mem);
>>>>> +int amdgpu_amdkfd_remove_gws_from_process(struct amdkfd_process_info *pinfo, void *mem);
>>>>>  uint32_t amdgpu_amdkfd_get_fw_version(struct amdgpu_device *adev,
>>>>>  				      enum kgd_engine_type type);
>>>>>  void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev,
>>>>> @@ -299,7 +299,7 @@ int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
>>>>>  				     struct amdgpu_vm *avm, u32 pasid);
>>>>>  int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
>>>>>  					struct amdgpu_vm *avm,
>>>>> -					void **process_info,
>>>>> +					struct amdkfd_process_info **process_info,
>>>>>  					struct dma_fence **ef);
>>>>>  void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,
>>>>>  					void *drm_priv);
>>>>> @@ -348,8 +348,8 @@ void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *ad
>>>>>  
>>>>>  bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
>>>>>  bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem);
>>>>> -void amdgpu_amdkfd_block_mmu_notifications(void *p);
>>>>> -int amdgpu_amdkfd_criu_resume(void *p);
>>>>> +void amdgpu_amdkfd_block_mmu_notifications(struct amdkfd_process_info *pinfo);
>>>>> +int amdgpu_amdkfd_criu_resume(struct amdkfd_process_info *pinfo);
>>>>>  int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
>>>>>  		uint64_t size, u32 alloc_flag, int8_t xcp_id);
>>>>>  void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>> index ce5ca304dba9..2a1ee17e44a1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>>> @@ -1376,7 +1376,7 @@ static int process_update_pds(struct amdkfd_process_info *process_info,
>>>>>  	return 0;
>>>>>  }
>>>>>  
>>>>> -static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
>>>>> +static int init_kfd_vm(struct amdgpu_vm *vm, struct amdkfd_process_info **process_info,
>>>>>  		       struct dma_fence **ef)
>>>>>  {
>>>>>  	struct amdkfd_process_info *info = NULL;
>>>>> @@ -1552,7 +1552,7 @@ int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
>>>>>  
>>>>>  int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
>>>>>  					   struct amdgpu_vm *avm,
>>>>> -					   void **process_info,
>>>>> +					   struct amdkfd_process_info **process_info,
>>>>>  					   struct dma_fence **ef)
>>>>>  {
>>>>>  	int ret;
>>>>> @@ -1639,19 +1639,16 @@ uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv)
>>>>>  	return avm->pd_phys_addr;
>>>>>  }
>>>>>  
>>>>> -void amdgpu_amdkfd_block_mmu_notifications(void *p)
>>>>> +void amdgpu_amdkfd_block_mmu_notifications(struct amdkfd_process_info *pinfo)
>>>>>  {
>>>>> -	struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;
>>>>> -
>>>>>  	mutex_lock(&pinfo->lock);
>>>>>  	WRITE_ONCE(pinfo->block_mmu_notifications, true);
>>>>>  	mutex_unlock(&pinfo->lock);
>>>>>  }
>>>>>  
>>>>> -int amdgpu_amdkfd_criu_resume(void *p)
>>>>> +int amdgpu_amdkfd_criu_resume(struct amdkfd_process_info *pinfo)
>>>>>  {
>>>>>  	int ret = 0;
>>>>> -	struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;
>>>>>  
>>>>>  	mutex_lock(&pinfo->lock);
>>>>>  	pr_debug("scheduling work\n");
>>>>> @@ -3093,13 +3090,12 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
>>>>>  	return ret;
>>>>>  }
>>>>>  
>>>>> -int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem)
>>>>> +int amdgpu_amdkfd_add_gws_to_process(struct amdkfd_process_info *pinfo, void *gws, struct kgd_mem **mem)
>>>>>  {
>>>>> -	struct amdkfd_process_info *process_info = (struct amdkfd_process_info *)info;
>>>>>  	struct amdgpu_bo *gws_bo = (struct amdgpu_bo *)gws;
>>>>>  	int ret;
>>>>>  
>>>>> -	if (!info || !gws)
>>>>> +	if (!pinfo || !gws)
>>>>>  		return -EINVAL;
>>>>>  
>>>>>  	*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
>>>>> @@ -3110,8 +3106,8 @@ int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem
>>>>>  	INIT_LIST_HEAD(&(*mem)->attachments);
>>>>>  	(*mem)->bo = amdgpu_bo_ref(gws_bo);
>>>>>  	(*mem)->domain = AMDGPU_GEM_DOMAIN_GWS;
>>>>> -	(*mem)->process_info = process_info;
>>>>> -	add_kgd_mem_to_kfd_bo_list(*mem, process_info, false);
>>>>> +	(*mem)->process_info = pinfo;
>>>>> +	add_kgd_mem_to_kfd_bo_list(*mem, pinfo, false);
>>>>>  	amdgpu_sync_create(&(*mem)->sync);
>>>>>  
>>>>>  
>>>>> @@ -3136,7 +3132,7 @@ int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem
>>>>>  	if (ret)
>>>>>  		goto reserve_shared_fail;
>>>>>  	dma_resv_add_fence(gws_bo->tbo.base.resv,
>>>>> -			   &process_info->eviction_fence->base,
>>>>> +			   &pinfo->eviction_fence->base,
>>>>>  			   DMA_RESV_USAGE_BOOKKEEP);
>>>>>  	amdgpu_bo_unreserve(gws_bo);
>>>>>  	mutex_unlock(&(*mem)->process_info->lock);
>>>>> @@ -3149,7 +3145,7 @@ int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem
>>>>>  bo_reservation_failure:
>>>>>  	mutex_unlock(&(*mem)->process_info->lock);
>>>>>  	amdgpu_sync_free(&(*mem)->sync);
>>>>> -	remove_kgd_mem_from_kfd_bo_list(*mem, process_info);
>>>>> +	remove_kgd_mem_from_kfd_bo_list(*mem, pinfo);
>>>>>  	amdgpu_bo_unref(&gws_bo);
>>>>>  	mutex_destroy(&(*mem)->lock);
>>>>>  	kfree(*mem);
>>>>> @@ -3157,17 +3153,16 @@ int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem
>>>>>  	return ret;
>>>>>  }
>>>>>  
>>>>> -int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem)
>>>>> +int amdgpu_amdkfd_remove_gws_from_process(struct amdkfd_process_info *pinfo, void *mem)
>>>>>  {
>>>>>  	int ret;
>>>>> -	struct amdkfd_process_info *process_info = (struct amdkfd_process_info *)info;
>>>>>  	struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
>>>>>  	struct amdgpu_bo *gws_bo = kgd_mem->bo;
>>>>>  
>>>>>  	/* Remove BO from process's validate list so restore worker won't touch
>>>>>  	 * it anymore
>>>>>  	 */
>>>>> -	remove_kgd_mem_from_kfd_bo_list(kgd_mem, process_info);
>>>>> +	remove_kgd_mem_from_kfd_bo_list(kgd_mem, pinfo);
>>>>>  
>>>>>  	ret = amdgpu_bo_reserve(gws_bo, false);
>>>>>  	if (unlikely(ret)) {
>>>>> @@ -3176,7 +3171,7 @@ int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem)
>>>>>  		return ret;
>>>>>  	}
>>>>>  	amdgpu_amdkfd_remove_eviction_fence(gws_bo,
>>>>> -			process_info->eviction_fence);
>>>>> +			pinfo->eviction_fence);
>>>>>  	amdgpu_bo_unreserve(gws_bo);
>>>>>  	amdgpu_sync_free(&kgd_mem->sync);
>>>>>  	amdgpu_bo_unref(&gws_bo);
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> index d6530febabad..b0426a1235b8 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> @@ -934,7 +934,7 @@ struct kfd_process {
>>>>>  	bool signal_event_limit_reached;
>>>>>  
>>>>>  	/* Information used for memory eviction */
>>>>> -	void *kgd_process_info;
>>>>> +	struct amdkfd_process_info *kgd_process_info;>  	/* Eviction fence that is attached to all the BOs of this process. The
>>>>>  	 * fence will be triggered during eviction and new one will be created
>>>>>  	 * during restore



More information about the amd-gfx mailing list