[PATCH v2 1/2] drm/amdkfd: Fix some double free when destroy queue fails
Felix Kuehling
felix.kuehling at amd.com
Thu Jun 17 21:52:12 UTC 2021
On 2021-06-17 at 8:41 a.m., Pan, Xinhui wrote:
> Felix
> What I am wondering is: if the CP hangs, can we assume all usermode queues have stopped?
> If so, we can do the cleanup work regardless of the retval of execute_queues_cpsch().
Right. That's what we currently do with ETIME, which happens when the
hang is first detected. I don't know why we need to treat the case
differently when we're already in a reset.
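
Concretely, that would mean dropping the -EIO special case from the patch
below and letting that path fall through to the existing cleanup. A sketch
of the idea only, not a tested patch:

	retval = execute_queues_cpsch(dqm,
			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
	if (retval == -ETIME)
		qpd->reset_wavefronts = true;
	/* No -EIO branch here: when a reset is already in progress the
	 * user mode queues are stopped just as in the -ETIME case, so
	 * fall through and release the queue's resources below.
	 */
	if (q->properties.is_gws) {
		dqm->gws_queue_count--;
		qpd->mapped_gws_queue = false;
	}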
Regards,
Felix
>
>> On 2021-06-17, at 20:11, Pan, Xinhui <Xinhui.Pan at amd.com> wrote:
>>
>> Felix
>> What I am thinking of, like below, looks simpler. :)
>>
>> @@ -1501,6 +1501,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>> /* remove queue from list to prevent rescheduling after preemption */
>> dqm_lock(dqm);
>>
>> + if (dqm->is_hws_hang) {
>> + retval = -EIO;
>> + goto failed_try_destroy_debugged_queue;
>> + }
>> +
>> if (qpd->is_debug) {
>> /*
>> * error, currently we do not allow to destroy a queue
>>
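>> The idea is that the is_hws_hang check runs right after dqm_lock(),
>> before the queue is unlinked from qpd->queues_list, so on this path
>> nothing is removed and nothing is freed, and there is nothing to re-add
>> on the error path. In context (sketch only):
>>
>> 	dqm_lock(dqm);
>>
>> 	if (dqm->is_hws_hang) {
>> 		/* GPU reset in progress: leave the queue and all of its
>> 		 * resources untouched; process termination cleans up later.
>> 		 */
>> 		retval = -EIO;
>> 		goto failed_try_destroy_debugged_queue;
>> 	}
>>
>> 	/* ... existing is_debug check, list_del(&q->list), mqd free ... */
>>
>> failed_try_destroy_debugged_queue:
>> 	dqm_unlock(dqm);
>> 	return retval;
>>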
>>> On 2021-06-17, at 20:02, Pan, Xinhui <Xinhui.Pan at amd.com> wrote:
>>>
>>> Handle queue destroy failures while the CP is hung.
>>> Once the CP hangs, kfd triggers a GPU reset and sets the related flags
>>> to stop the driver from touching the queue. As we leave the queue as it
>>> is, we need to keep its resources as they are too.
>>>
>>> Regardless of whether user space tries to destroy the queue again, we
>>> need to put the queue back on the list so that process termination can
>>> do the cleanup work. Moreover, if user space does try to destroy the
>>> queue again, we will not free its resources twice.
>>>
>>> KFD returns -EIO in this case, so let's handle it now.
>>>
>>> Some error logs observed without this patch are pasted below.
>>>
>>> amdgpu: Can't create new usermode queue because -1 queues were already
>>> created
>>>
>>> refcount_t: underflow; use-after-free.
>>> Call Trace:
>>> kobject_put+0xe6/0x1b0
>>> kfd_procfs_del_queue+0x37/0x50 [amdgpu]
>>> pqm_destroy_queue+0x17a/0x390 [amdgpu]
>>> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>>> kfd_ioctl+0x463/0x690 [amdgpu]
>>>
>>> BUG kmalloc-32 (Tainted: G W ): Object already free
>>> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
>>> pid=2511
>>> __slab_alloc+0x72/0x80
>>> kmem_cache_alloc_trace+0x81f/0x8c0
>>> allocate_sdma_mqd+0x30/0xb0 [amdgpu]
>>> create_queue_cpsch+0xbf/0x470 [amdgpu]
>>> pqm_create_queue+0x28d/0x6d0 [amdgpu]
>>> kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
>>> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
>>> pid=2511
>>> kfree+0x322/0x340
>>> free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
>>> destroy_queue_cpsch+0x20c/0x330 [amdgpu]
>>> pqm_destroy_queue+0x1a3/0x390 [amdgpu]
>>> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan at amd.com>
>>> ---
>>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 13 +++++++++++++
>>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 +++-
>>> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 ++
>>> 3 files changed, 18 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index c069fa259b30..63a9a19a3987 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -1530,6 +1530,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>> if (retval == -ETIME)
>>> qpd->reset_wavefronts = true;
>>> + /* In a GPU reset? We leave the queue as it is, so do NOT
>>> + * clean up its resources.
>>> + */
>>> + else if (retval == -EIO)
>>> + goto failed_execute_queue;
>>> if (q->properties.is_gws) {
>>> dqm->gws_queue_count--;
>>> qpd->mapped_gws_queue = false;
>>> @@ -1551,6 +1556,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>>
>>> return retval;
>>>
>>> +failed_execute_queue:
>>> + /* Put the queue back on the list so we get a chance to destroy it.
>>> + * FIXME: we do NOT want the queue in the runlist again.
>>> + */
>>> + list_add(&q->list, &qpd->queues_list);
>>> + qpd->queue_count++;
>>> + if (q->properties.is_active)
>>> + increment_queue_count(dqm, q->properties.type);
>>> failed_try_destroy_debugged_queue:
>>>
>>> dqm_unlock(dqm);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 09b98a83f670..984197e5929f 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>>>
>>> void kfd_procfs_del_queue(struct queue *q)
>>> {
>>> - if (!q)
>>> + if (!q || !kobject_get_unless_zero(&q->kobj))
>>> return;
>>>
>>> kobject_del(&q->kobj);
>>> kobject_put(&q->kobj);
>>> + /* Pairs with the kobject_get_unless_zero() above */
>>> + kobject_put(&q->kobj);
>>> }
>>>
>>> int kfd_process_create_wq(void)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index 95a6c36cea4c..0588e552b8ec 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>>> dqm = pqn->kq->dev->dqm;
>>> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>> kernel_queue_uninit(pqn->kq, false);
>>> + pqn->kq = NULL;
>>> }
>>>
>>> if (pqn->q) {
>>> @@ -396,6 +397,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>>> kfree(pqn->q->properties.cu_mask);
>>> pqn->q->properties.cu_mask = NULL;
>>> uninit_queue(pqn->q);
>>> + pqn->q = NULL;
>>> }
>>>
>>> list_del(&pqn->process_queue_list);
>>> --
>>> 2.25.1
>>>
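
For reference, the kfd_procfs_del_queue() change above is the usual
refcount guard pattern: kobject_get_unless_zero() fails once the kobject's
refcount has already dropped to zero, so the sysfs teardown runs at most
once even if the function is reached again on a retried destroy. A minimal
sketch of the pattern (hypothetical example_del(), not KFD code):

	static void example_del(struct kobject *kobj)
	{
		/* The get fails if an earlier call already dropped the
		 * refcount to zero, making a second call a no-op.
		 */
		if (!kobj || !kobject_get_unless_zero(kobj))
			return;

		kobject_del(kobj);	/* unlink from sysfs */
		kobject_put(kobj);	/* drop the original reference */
		kobject_put(kobj);	/* pairs with the get above */
	}

Together with clearing pqn->kq and pqn->q after they are freed, this turns
a second trip through pqm_destroy_queue() into a no-op instead of a double
free.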