[PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
Kuehling, Felix
Felix.Kuehling at amd.com
Fri May 31 21:31:20 UTC 2019
On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
> SDMA queue allocation requires the dqm lock as it modify
> the global dqm members. Introduce functions to allocate/deallocate
> in locked/unlocked circumstance.
>
> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
> Signed-off-by: Oak Zeng <Oak.Zeng at amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 46 ++++++++++++++++------
> 1 file changed, 35 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ece35c7..1f707bb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q);
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
> + struct queue *q);
>
> static void kfd_process_hw_exception(struct work_struct *work);
>
> @@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
> deallocate_hqd(dqm, q);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> dqm->sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> dqm->xgmi_sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> } else {
> pr_debug("q->properties.type %d is invalid\n",
> q->properties.type);
> @@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> if (dqm->sdma_bitmap == 0)
> return -ENOMEM;
> + dqm_lock(dqm);
Doesn't this cause a locking error where you try to take the same lock
twice in this call tree:
create_queue_nocpsch (takes DQM lock)
-> create_sdma_queue_nocpsch
-> allocate_sdma_queue (takes DQM lock again)
> bit = __ffs64(dqm->sdma_bitmap);
> dqm->sdma_bitmap &= ~(1ULL << bit);
> + dqm_unlock(dqm);
> q->sdma_id = bit;
> q->properties.sdma_engine_id = q->sdma_id %
> get_num_sdma_engines(dqm);
> @@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> if (dqm->xgmi_sdma_bitmap == 0)
> return -ENOMEM;
> + dqm_lock(dqm);
> bit = __ffs64(dqm->xgmi_sdma_bitmap);
> dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> + dqm_unlock(dqm);
> q->sdma_id = bit;
> /* sdma_engine_id is sdma id including
> * both PCIe-optimized SDMAs and XGMI-
> @@ -953,17 +959,35 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
> return 0;
> }
>
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
> + struct queue *q)
> +{
> + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> + if (q->sdma_id >= get_num_sdma_queues(dqm))
> + return;
> + dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> + if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
> + return;
> + dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> + }
> +}
> +
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q)
> {
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> if (q->sdma_id >= get_num_sdma_queues(dqm))
> return;
> + dqm_lock(dqm);
> dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> + dqm_unlock(dqm);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
> return;
> + dqm_lock(dqm);
> dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> + dqm_unlock(dqm);
> }
> }
You could minimize code duplication by defining deallocate_sdma_queue as
a simple wrapper:
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q)
{
dqm_lock(dqm);
deallocate_sdma_queue_locked(dqm, q);
dqm_unlock(dqm);
}
>
> @@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>
> retval = allocate_doorbell(qpd, q);
> if (retval)
> - goto out_deallocate_sdma_queue;
> + goto out_deallocate_sdma_queue_locked;
>
> dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> @@ -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
> mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> out_deallocate_doorbell:
> deallocate_doorbell(qpd, q);
> -out_deallocate_sdma_queue:
> - deallocate_sdma_queue(dqm, q);
> +out_deallocate_sdma_queue_locked:
> + deallocate_sdma_queue_locked(dqm, q);
>
> return retval;
> }
> @@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>
> retval = allocate_doorbell(qpd, q);
> if (retval)
> - goto out_deallocate_sdma_queue;
> + goto out_deallocate_sdma_queue_locked;
>
> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> q->properties.type)];
> @@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>
> out_deallocate_doorbell:
> deallocate_doorbell(qpd, q);
> -out_deallocate_sdma_queue:
> +out_deallocate_sdma_queue_locked:
Why are you renaming this label? You don't hold the DQM lock when you
get here.
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> deallocate_sdma_queue(dqm, q);
> @@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> dqm->sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> dqm->xgmi_sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> }
>
> list_del(&q->list);
> @@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> list_for_each_entry(q, &qpd->queues_list, list) {
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> dqm->sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> dqm->xgmi_sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> }
>
> if (q->properties.is_active)
More information about the amd-gfx
mailing list