[PATCH v2 1/2] drm/amdkfd: Fix some double free when destroy queue fails
Felix Kuehling
felix.kuehling at amd.com
Thu Jun 17 21:52:12 UTC 2021
On 2021-06-17 at 8:41 a.m., Pan, Xinhui wrote:
> Felix
> What I am wondering is: if the CP hangs, can we assume all usermode queues have stopped?
> If so, we can do the cleanup work regardless of the retval of execute_queues_cpsch().
Right. That's what we currently do with ETIME, which happens when the
hang is first detected. I don't know why we need to treat the case
differently when we're already in a reset.
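
Concretely, that would mean dropping the -EIO special case from the patch
below and letting that path fall through to the existing cleanup. A sketch
of the idea only, not a tested patch:

	retval = execute_queues_cpsch(dqm,
			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
	if (retval == -ETIME)
		qpd->reset_wavefronts = true;
	/* No -EIO branch here: when a reset is already in progress the
	 * user mode queues are stopped just as in the -ETIME case, so
	 * fall through and release the queue's resources below.
	 */
	if (q->properties.is_gws) {
		dqm->gws_queue_count--;
		qpd->mapped_gws_queue = false;
	}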
Regards,
Felix
>
>> On 2021-06-17, at 20:11, Pan, Xinhui <Xinhui.Pan at amd.com> wrote:
>>
>> Felix
>> What I am thinking of, like below, looks simpler. :)
>>
>> @@ -1501,6 +1501,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>> /* remove queue from list to prevent rescheduling after preemption */
>> dqm_lock(dqm);
>>
>> + if (dqm->is_hws_hang) {
>> + retval = -EIO;
>> + goto failed_try_destroy_debugged_queue;
>> + }
>> +
>> if (qpd->is_debug) {
>> /*
>> * error, currently we do not allow to destroy a queue
>>
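>> The idea is that the is_hws_hang check runs right after dqm_lock(),
>> before the queue is unlinked from qpd->queues_list, so on this path
>> nothing is removed and nothing is freed, and there is nothing to re-add
>> on the error path. In context (sketch only):
>>
>> 	dqm_lock(dqm);
>>
>> 	if (dqm->is_hws_hang) {
>> 		/* GPU reset in progress: leave the queue and all of its
>> 		 * resources untouched; process termination cleans up later.
>> 		 */
>> 		retval = -EIO;
>> 		goto failed_try_destroy_debugged_queue;
>> 	}
>>
>> 	/* ... existing is_debug check, list_del(&q->list), mqd free ... */
>>
>> failed_try_destroy_debugged_queue:
>> 	dqm_unlock(dqm);
>> 	return retval;
>>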
>>> On 2021-06-17, at 20:02, Pan, Xinhui <Xinhui.Pan at amd.com> wrote:
>>>
>>> Handle queue destroy failures while the CP is hung.
>>> Once the CP hangs, kfd triggers a GPU reset and sets the related flags
>>> to stop the driver from touching the queue. As we leave the queue as it
>>> is, we need to keep its resources as they are too.
>>>
>>> Regardless of whether user space tries to destroy the queue again, we
>>> need to put the queue back on the list so that process termination can
>>> do the cleanup work. Moreover, if user space does try to destroy the
>>> queue again, we will not free its resources twice.
>>>
>>> KFD returns -EIO in this case, so let's handle it now.
>>>
>>> Some error logs observed without this patch are pasted below.
>>>
>>> amdgpu: Can't create new usermode queue because -1 queues were already
>>> created
>>>
>>> refcount_t: underflow; use-after-free.
>>> Call Trace:
>>> kobject_put+0xe6/0x1b0
>>> kfd_procfs_del_queue+0x37/0x50 [amdgpu]
>>> pqm_destroy_queue+0x17a/0x390 [amdgpu]
>>> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>>> kfd_ioctl+0x463/0x690 [amdgpu]
>>>
>>> BUG kmalloc-32 (Tainted: G W ): Object already free
>>> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
>>> pid=2511
>>> __slab_alloc+0x72/0x80
>>> kmem_cache_alloc_trace+0x81f/0x8c0
>>> allocate_sdma_mqd+0x30/0xb0 [amdgpu]
>>> create_queue_cpsch+0xbf/0x470 [amdgpu]
>>> pqm_create_queue+0x28d/0x6d0 [amdgpu]
>>> kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
>>> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
>>> pid=2511
>>> kfree+0x322/0x340
>>> free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
>>> destroy_queue_cpsch+0x20c/0x330 [amdgpu]
>>> pqm_destroy_queue+0x1a3/0x390 [amdgpu]
>>> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan at amd.com>
>>> ---
>>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 13 +++++++++++++
>>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 +++-
>>> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 ++
>>> 3 files changed, 18 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index c069fa259b30..63a9a19a3987 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -1530,6 +1530,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>> if (retval == -ETIME)
>>> qpd->reset_wavefronts = true;
>>> + /* In a GPU reset? We leave the queue as it is, so do NOT
>>> + * clean up its resources.
>>> + */
>>> + else if (retval == -EIO)
>>> + goto failed_execute_queue;
>>> if (q->properties.is_gws) {
>>> dqm->gws_queue_count--;
>>> qpd->mapped_gws_queue = false;
>>> @@ -1551,6 +1556,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>>
>>> return retval;
>>>
>>> +failed_execute_queue:
>>> + /* Put the queue back on the list so we get a chance to destroy it.
>>> + * FIXME: we do NOT want the queue in the runlist again.
>>> + */
>>> + list_add(&q->list, &qpd->queues_list);
>>> + qpd->queue_count++;
>>> + if (q->properties.is_active)
>>> + increment_queue_count(dqm, q->properties.type);
>>> failed_try_destroy_debugged_queue:
>>>
>>> dqm_unlock(dqm);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 09b98a83f670..984197e5929f 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>>>
>>> void kfd_procfs_del_queue(struct queue *q)
>>> {
>>> - if (!q)
>>> + if (!q || !kobject_get_unless_zero(&q->kobj))
>>> return;
>>>
>>> kobject_del(&q->kobj);
>>> kobject_put(&q->kobj);
>>> + /* Pairs with the kobject_get_unless_zero() above */
>>> + kobject_put(&q->kobj);
>>> }
>>>
>>> int kfd_process_create_wq(void)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index 95a6c36cea4c..0588e552b8ec 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>>> dqm = pqn->kq->dev->dqm;
>>> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>> kernel_queue_uninit(pqn->kq, false);
>>> + pqn->kq = NULL;
>>> }
>>>
>>> if (pqn->q) {
>>> @@ -396,6 +397,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>>> kfree(pqn->q->properties.cu_mask);
>>> pqn->q->properties.cu_mask = NULL;
>>> uninit_queue(pqn->q);
>>> + pqn->q = NULL;
>>> }
>>>
>>> list_del(&pqn->process_queue_list);
>>> --
>>> 2.25.1
>>>
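
For reference, the kfd_procfs_del_queue() change above is the usual
refcount guard pattern: kobject_get_unless_zero() fails once the kobject's
refcount has already dropped to zero, so the sysfs teardown runs at most
once even if the function is reached again on a retried destroy. A minimal
sketch of the pattern (hypothetical example_del(), not KFD code):

	static void example_del(struct kobject *kobj)
	{
		/* The get fails if an earlier call already dropped the
		 * refcount to zero, making a second call a no-op.
		 */
		if (!kobj || !kobject_get_unless_zero(kobj))
			return;

		kobject_del(kobj);	/* unlink from sysfs */
		kobject_put(kobj);	/* drop the original reference */
		kobject_put(kobj);	/* pairs with the get above */
	}

Together with clearing pqn->kq and pqn->q after they are freed, this turns
a second trip through pqm_destroy_queue() into a no-op instead of a double
free.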