[PATCH v3] drm/amdkfd: Fix some double free when destroy queue fails

Felix Kuehling felix.kuehling at amd.com
Fri Jun 18 21:02:15 UTC 2021


Am 2021-06-18 um 2:02 a.m. schrieb xinhui pan:
> Handle queue destroy failure while CP hang.
> Once CP got hang, kfd trigger GPU reset and set related flags to stop
> driver touching the queue. But all usermode queues have stopped, we
> actually somehow succeed to remove queue from runlist. So lets do
> cleanup work as normal in -EIO/-ETIME case.
>
> For other fatal error cases, say ENOMEM, we have to skip cleanup this
> time as queue might still be running. Regardless user-space tries to
> destroy the queue again or not. We need put queue back to the list so
> process termination would do the cleanup work. What's more, if userspace
> tries to destroy the queue again, we would not free its resource twice.
>
> Paste some error log below without this patch.
>
> amdgpu: Can't create new usermode queue because -1 queues were already
> created
>
> refcount_t: underflow; use-after-free.
> Call Trace:
>  kobject_put+0xe6/0x1b0
>  kfd_procfs_del_queue+0x37/0x50 [amdgpu]
>  pqm_destroy_queue+0x17a/0x390 [amdgpu]
>  kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>  kfd_ioctl+0x463/0x690 [amdgpu]
>
> BUG kmalloc-32 (Tainted: G        W        ): Object already free
> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
>  __slab_alloc+0x72/0x80
>  kmem_cache_alloc_trace+0x81f/0x8c0
>  allocate_sdma_mqd+0x30/0xb0 [amdgpu]
>  create_queue_cpsch+0xbf/0x470 [amdgpu]
>  pqm_create_queue+0x28d/0x6d0 [amdgpu]
>  kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
>  kfree+0x322/0x340
>  free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
>  destroy_queue_cpsch+0x20c/0x330 [amdgpu]
>  pqm_destroy_queue+0x1a3/0x390 [amdgpu]
>  kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>
> Signed-off-by: xinhui pan <xinhui.pan at amd.com>
> ---
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 15 ++++++++++++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c          |  4 +++-
>  .../drm/amd/amdkfd/kfd_process_queue_manager.c    |  4 +++-
>  3 files changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 16a1713808c2..f38f011e6f97 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1528,8 +1528,13 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>  		decrement_queue_count(dqm, q->properties.type);
>  		retval = execute_queues_cpsch(dqm,
>  				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> -		if (retval == -ETIME)
> +		/* CP hang and all usermode queues have stopped.
> +		 * We are safe to do the cleanup work.
> +		 */
> +		if (retval == -ETIME || retval == -EIO)
>  			qpd->reset_wavefronts = true;
> +		else if (retval)
> +			goto failed_execute_queue;
>  		if (q->properties.is_gws) {
>  			dqm->gws_queue_count--;
>  			qpd->mapped_gws_queue = false;
> @@ -1551,6 +1556,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>  
>  	return retval;
>  
> +failed_execute_queue:
> +	/* Put queue back to the list, then we have chance to destroy it.
> +	 * FIXME: we do NOT want the queue in the runlist again.
> +	 */
> +	list_add(&q->list, &qpd->queues_list);
> +	qpd->queue_count++;
> +	if (q->properties.is_active)
> +		increment_queue_count(dqm, q->properties.type);

You should be able to keep the queue off the runlist by disabling it.
You can do that e.g. by setting q->properties.queue_address = NULL and
q->proterties.is_active = false. You'd have to update
dqm->gws_queue_count and qpd->mapped_gws_queues manually, or move that
change before the execute_queues_cpsch above.

Regards,
  Felix


>  failed_try_destroy_debugged_queue:
>  
>  	dqm_unlock(dqm);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 09b98a83f670..984197e5929f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>  
>  void kfd_procfs_del_queue(struct queue *q)
>  {
> -	if (!q)
> +	if (!q || !kobject_get_unless_zero(&q->kobj))
>  		return;
>  
>  	kobject_del(&q->kobj);
>  	kobject_put(&q->kobj);
> +	/* paired with the get above */
> +	kobject_put(&q->kobj);
>  }
>  
>  int kfd_process_create_wq(void)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 95a6c36cea4c..c796c7601365 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>  		dqm = pqn->kq->dev->dqm;
>  		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>  		kernel_queue_uninit(pqn->kq, false);
> +		pqn->kq = NULL;
>  	}
>  
>  	if (pqn->q) {
> @@ -383,7 +384,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>  			pr_err("Pasid 0x%x destroy queue %d failed, ret %d\n",
>  				pqm->process->pasid,
>  				pqn->q->properties.queue_id, retval);
> -			if (retval != -ETIME)
> +			if (retval != -ETIME && retval != -EIO)
>  				goto err_destroy_queue;
>  		}
>  
> @@ -396,6 +397,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>  		kfree(pqn->q->properties.cu_mask);
>  		pqn->q->properties.cu_mask = NULL;
>  		uninit_queue(pqn->q);
> +		pqn->q = NULL;
>  	}
>  
>  	list_del(&pqn->process_queue_list);


More information about the amd-gfx mailing list