[PATCH v2] drm/amdkfd: Free gang_ctx_bo and wptr_bo in pqm_uninit

Felix Kuehling felix.kuehling at amd.com
Wed Nov 22 19:12:35 UTC 2023


On 2023-11-20 02:17, ZhenGuo Yin wrote:
> [Why]
> Memory leaks of gang_ctx_bo and wptr_bo.
>
> [How]
> Free gang_ctx_bo and wptr_bo in pqm_uninit.
>
> v2: add a common function pqm_clean_queue_resource to
> free queue's resources.
>
> Signed-off-by: ZhenGuo Yin <zhenguo.yin at amd.com>
> ---
>   .../amd/amdkfd/kfd_process_queue_manager.c    | 46 ++++++++++---------
>   1 file changed, 25 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 77649392e233..b027485fbe66 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -169,16 +169,34 @@ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p)
>   	return 0;
>   }
>   
> +static void pqm_clean_queue_resource(struct process_queue_manager *pqm,
> +				     struct process_queue_node *pqn)
> +{
> +	struct kfd_node *dev;
> +
> +	dev = pqn->q->device;
> +	if (pqn->q->gws) {
> +		if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> +		    !dev->kfd->shared_resources.enable_mes)
> +			amdgpu_amdkfd_remove_gws_from_process(
> +				pqm->process->kgd_process_info, pqn->q->gws);

I think it would make sense to also set pdd->qpd.num_gws = 0 here, inside
the if (pqn->q->gws) block.


> +	}
> +
> +	if (dev->kfd->shared_resources.enable_mes) {
> +		amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->gang_ctx_bo);
> +		if (pqn->q->wptr_bo)
> +			amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->wptr_bo);
> +	}
> +}
> +
>   void pqm_uninit(struct process_queue_manager *pqm)
>   {
>   	struct process_queue_node *pqn, *next;
>   
>   	list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
> -		if (pqn->q && pqn->q->gws &&
> -		    KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> -		    !pqn->q->device->kfd->shared_resources.enable_mes)
> -			amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info,
> -				pqn->q->gws);
> +		if (pqn->q)
> +			pqm_clean_queue_resource(pqm, pqn);
> +
>   		kfd_procfs_del_queue(pqn->q);
>   		uninit_queue(pqn->q);
>   		list_del(&pqn->process_queue_list);
> @@ -461,22 +479,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>   				goto err_destroy_queue;
>   		}
>   
> -		if (pqn->q->gws) {
> -			if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> -			    !dev->kfd->shared_resources.enable_mes)
> -				amdgpu_amdkfd_remove_gws_from_process(
> -						pqm->process->kgd_process_info,
> -						pqn->q->gws);
> -			pdd->qpd.num_gws = 0;
> -		}
> -
> -		if (dev->kfd->shared_resources.enable_mes) {
> -			amdgpu_amdkfd_free_gtt_mem(dev->adev,
> -						   pqn->q->gang_ctx_bo);
> -			if (pqn->q->wptr_bo)
> -				amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->wptr_bo);
> -
> -		}
> +		pqm_clean_queue_resource(pqm, pqn);
> +		pdd->qpd.num_gws = 0;

This is incorrect. We should only set pdd->qpd.num_gws = 0 if we are 
destroying the GWS queue. That's why it's better to do this above, in 
pqm_clean_queue_resource under the pqn->q->gws check, so you don't need 
to duplicate these conditions.

Regards,
   Felix


>   		uninit_queue(pqn->q);
>   	}
>   


More information about the amd-gfx mailing list