[PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
Zeng, Oak
Oak.Zeng at amd.com
Mon Jun 3 16:05:23 UTC 2019
Hi Felix,
See comment inline [Oak]
Regards,
Oak
-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling at amd.com>
Sent: Friday, May 31, 2019 5:50 PM
To: Zeng, Oak <Oak.Zeng at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhao, Yong <Yong.Zhao at amd.com>; Freehill, Chris <Chris.Freehill at amd.com>; Liu, Alex <Alex.Liu at amd.com>
Subject: Re: [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
On 2019-05-31 5:31 p.m., Kuehling, Felix wrote:
> On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
>> SDMA queue allocation requires the dqm lock as it modifies the global
>> dqm members. Introduce functions to allocate/deallocate in
>> locked/unlocked circumstances.
>>
>> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
>> Signed-off-by: Oak Zeng <Oak.Zeng at amd.com>
>> ---
>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 46 ++++++++++++++++------
>> 1 file changed, 35 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index ece35c7..1f707bb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct
>> device_queue_manager *dqm,
>>
>> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>> struct queue *q);
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> + struct queue *q);
>>
>> static void kfd_process_hw_exception(struct work_struct *work);
>>
>> @@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>> deallocate_hqd(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> dqm->sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> dqm->xgmi_sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else {
>> pr_debug("q->properties.type %d is invalid\n",
>> q->properties.type);
>> @@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> if (dqm->sdma_bitmap == 0)
>> return -ENOMEM;
>> + dqm_lock(dqm);
> Doesn't this cause a locking error where you try to take the same lock
> twice in this call tree:
>
> create_queue_nocpsch (takes DQM lock)
> -> create_sdma_queue_nocpsch
> -> allocate_sdma_queue (takes DQM lock again)
>
BTW, I think you actually caught a bug here. In the create_queue_cpsch path we failed to lock the DQM for SDMA queue allocation. In create_queue_nocpsch it was not a problem because we took the DQM lock earlier (which is itself a problem you're working on fixing).
However, now you're making the problem worse for the nocpsch case, at least until patch 4, which moves the DQM locking in create_queue_nocpsch. Maybe this change should come after patch 4 so you don't end up with a worse problem in the middle of your patch series.
[Oak]: I realized the same issue when I wrote this patch... I will have to create functions allocate_sdma_queue_locked/unlocked to fix the issue properly. But I will have to delete allocate_sdma_queue_unlocked after patch 4 because it won't be used anymore after that.
It is a good idea to do this patch after patch 4. I will re-submit the patch series, also fixing the other issues you mentioned.
Also, I think you're doing the locking unnecessarily fine-grained. It's probably enough to take the DQM lock once at the start of allocate_sdma_queue, and drop it once at the end. No need to duplicate this in the two if branches.
[Oak] Agreed.
Regards,
Felix
>> bit = __ffs64(dqm->sdma_bitmap);
>> dqm->sdma_bitmap &= ~(1ULL << bit);
>> + dqm_unlock(dqm);
>> q->sdma_id = bit;
>> q->properties.sdma_engine_id = q->sdma_id %
>> get_num_sdma_engines(dqm);
>> @@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> if (dqm->xgmi_sdma_bitmap == 0)
>> return -ENOMEM;
>> + dqm_lock(dqm);
>> bit = __ffs64(dqm->xgmi_sdma_bitmap);
>> dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>> + dqm_unlock(dqm);
>> q->sdma_id = bit;
>> /* sdma_engine_id is sdma id including
>> * both PCIe-optimized SDMAs and XGMI-
>> @@ -953,17 +959,35 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> return 0;
>> }
>>
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> + struct queue *q)
>> +{
>> + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> + if (q->sdma_id >= get_num_sdma_queues(dqm))
>> + return;
>> + dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> + if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>> + return;
>> + dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> + }
>> +}
>> +
>> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>> struct queue *q)
>> {
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> if (q->sdma_id >= get_num_sdma_queues(dqm))
>> return;
>> + dqm_lock(dqm);
>> dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> + dqm_unlock(dqm);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>> return;
>> + dqm_lock(dqm);
>> dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> + dqm_unlock(dqm);
>> }
>> }
> You could minimize code duplication by defining deallocate_sdma_queue
> as a simple wrapper:
>
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q) {
> dqm_lock(dqm);
> deallocate_sdma_queue_locked(dqm, q);
> dqm_unlock(dqm);
> }
>
>
>>
>> @@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct
>> device_queue_manager *dqm,
>>
>> retval = allocate_doorbell(qpd, q);
>> if (retval)
>> - goto out_deallocate_sdma_queue;
>> + goto out_deallocate_sdma_queue_locked;
>>
>> dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>> retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
>> @@ -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>> mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>> out_deallocate_doorbell:
>> deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> - deallocate_sdma_queue(dqm, q);
>> +out_deallocate_sdma_queue_locked:
>> + deallocate_sdma_queue_locked(dqm, q);
>>
>> return retval;
>> }
>> @@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct
>> device_queue_manager *dqm, struct queue *q,
>>
>> retval = allocate_doorbell(qpd, q);
>> if (retval)
>> - goto out_deallocate_sdma_queue;
>> + goto out_deallocate_sdma_queue_locked;
>>
>> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>> q->properties.type)];
>> @@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct
>> device_queue_manager *dqm, struct queue *q,
>>
>> out_deallocate_doorbell:
>> deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> +out_deallocate_sdma_queue_locked:
> Why are you renaming this label? You don't hold the DQM lock when you
> get here.
>
>
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>> q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>> deallocate_sdma_queue(dqm, q);
>> @@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct
>> device_queue_manager *dqm,
>>
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> dqm->sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> dqm->xgmi_sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> }
>>
>> list_del(&q->list);
>> @@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>> list_for_each_entry(q, &qpd->queues_list, list) {
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> dqm->sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> dqm->xgmi_sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> }
>>
>> if (q->properties.is_active)
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list