[PATCH v2] drm/amdkfd: Free gang_ctx_bo and wptr_bo in pqm_uninit
Felix Kuehling
felix.kuehling at amd.com
Wed Nov 22 19:12:35 UTC 2023
On 2023-11-20 02:17, ZhenGuo Yin wrote:
> [Why]
> Memory leaks of gang_ctx_bo and wptr_bo.
>
> [How]
> Free gang_ctx_bo and wptr_bo in pqm_uninit.
>
> v2: add a common function pqm_clean_queue_resource to
> free queue's resources.
>
> Signed-off-by: ZhenGuo Yin <zhenguo.yin at amd.com>
> ---
> .../amd/amdkfd/kfd_process_queue_manager.c | 46 ++++++++++---------
> 1 file changed, 25 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 77649392e233..b027485fbe66 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -169,16 +169,34 @@ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p)
> return 0;
> }
>
> +static void pqm_clean_queue_resource(struct process_queue_manager *pqm,
> + struct process_queue_node *pqn)
> +{
> + struct kfd_node *dev;
> +
> + dev = pqn->q->device;
> + if (pqn->q->gws) {
> + if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> + !dev->kfd->shared_resources.enable_mes)
> + amdgpu_amdkfd_remove_gws_from_process(
> + pqm->process->kgd_process_info, pqn->q->gws);
I think it would make sense to also set pdd->qpd.num_gws = 0 here.
> + }
> +
> + if (dev->kfd->shared_resources.enable_mes) {
> + amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->gang_ctx_bo);
> + if (pqn->q->wptr_bo)
> + amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->wptr_bo);
> + }
> +}
> +
> void pqm_uninit(struct process_queue_manager *pqm)
> {
> struct process_queue_node *pqn, *next;
>
> list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
> - if (pqn->q && pqn->q->gws &&
> - KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> - !pqn->q->device->kfd->shared_resources.enable_mes)
> - amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info,
> - pqn->q->gws);
> + if (pqn->q)
> + pqm_clean_queue_resource(pqm, pqn);
> +
> kfd_procfs_del_queue(pqn->q);
> uninit_queue(pqn->q);
> list_del(&pqn->process_queue_list);
> @@ -461,22 +479,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
> goto err_destroy_queue;
> }
>
> - if (pqn->q->gws) {
> - if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
> - !dev->kfd->shared_resources.enable_mes)
> - amdgpu_amdkfd_remove_gws_from_process(
> - pqm->process->kgd_process_info,
> - pqn->q->gws);
> - pdd->qpd.num_gws = 0;
> - }
> -
> - if (dev->kfd->shared_resources.enable_mes) {
> - amdgpu_amdkfd_free_gtt_mem(dev->adev,
> - pqn->q->gang_ctx_bo);
> - if (pqn->q->wptr_bo)
> - amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->wptr_bo);
> -
> - }
> + pqm_clean_queue_resource(pqm, pqn);
> + pdd->qpd.num_gws = 0;
This is incorrect. We should only set pdd->qpd.num_gws = 0 if we are
destroying the GWS queue. That's why it's better to do this above, inside
the if (pqn->q->gws) block of pqm_clean_queue_resource, so you don't need
to duplicate these conditions.
Regards,
Felix
> uninit_queue(pqn->q);
> }
>
More information about the amd-gfx mailing list