[PATCH 2/3] drm/amdkfd: avoid circular lock dependency caused by HMM change v2
Christian König
ckoenig.leichtzumerken at gmail.com
Tue Feb 5 11:42:51 UTC 2019
Am 04.02.19 um 19:23 schrieb Yang, Philip:
> There is a circular lock dependency between the gfx and kfd paths with the HMM change:
> lock(dqm) -> bo::reserve -> amdgpu_mn_lock
>
> To avoid this, move init/uninit_mqd() out of lock(dqm), which removes the
> nested locking between mmap_sem and bo::reserve. The resulting locking
> order is: bo::reserve -> amdgpu_mn_lock(p->mn)
In general this sounds correct to me, but beyond that I don't know the
code well enough to fully judge.
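To illustrate the ordering the patch establishes, here is a minimal
user-space sketch, with pthread mutexes standing in for the kernel locks;
the names init_mqd_sketch() and create_queue_sketch() are made up for the
example and are not kfd functions:

/* Illustration only: hypothetical stand-ins for the kernel locks. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dqm_lock_m   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t bo_reserve_m = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t mn_lock_m    = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for init_mqd(): with the HMM change it ends up taking
 * bo::reserve and then amdgpu_mn_lock. */
static void init_mqd_sketch(void)
{
        pthread_mutex_lock(&bo_reserve_m);
        pthread_mutex_lock(&mn_lock_m);
        /* ... allocate and initialize the MQD ... */
        pthread_mutex_unlock(&mn_lock_m);
        pthread_mutex_unlock(&bo_reserve_m);
}

/* After the patch: do the MQD/BO work first, then take the dqm lock
 * only for the queue list update, so lock(dqm) never sits above
 * bo::reserve -> amdgpu_mn_lock. */
static void create_queue_sketch(void)
{
        init_mqd_sketch();

        pthread_mutex_lock(&dqm_lock_m);
        /* ... add the queue to qpd->queues_list, update counters ... */
        pthread_mutex_unlock(&dqm_lock_m);
}

int main(void)
{
        create_queue_sketch();
        printf("order: bo::reserve -> amdgpu_mn_lock, then lock(dqm)\n");
        return 0;
}

With the previous code the dqm lock would have been held across the
init_mqd() step, putting lock(dqm) above bo::reserve and closing the
cycle; the reordering below avoids that.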
>
> Change-Id: I2ec09a47571f6b4c8eaef93f22c0a600f5f70153
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
Acked-by: Christian König <christian.koenig at amd.com>
> ---
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 32 ++++++++++---------
> 1 file changed, 17 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 8372556b52eb..efe0d3c0215b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1156,21 +1156,17 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> int retval;
> struct mqd_manager *mqd_mgr;
>
> - retval = 0;
> -
> - dqm_lock(dqm);
> -
> if (dqm->total_queue_count >= max_num_of_queues_per_device) {
> pr_warn("Can't create new usermode queue because %d queues were already created\n",
> dqm->total_queue_count);
> retval = -EPERM;
> - goto out_unlock;
> + goto out;
> }
>
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> retval = allocate_sdma_queue(dqm, &q->sdma_id);
> if (retval)
> - goto out_unlock;
> + goto out;
> q->properties.sdma_queue_id =
> q->sdma_id / get_num_sdma_engines(dqm);
> q->properties.sdma_engine_id =
> @@ -1181,6 +1177,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> if (retval)
> goto out_deallocate_sdma_queue;
>
> + /* Do init_mqd before dqm_lock(dqm) to avoid circular locking order:
> + * lock(dqm) -> bo::reserve
> + */
> mqd_mgr = dqm->ops.get_mqd_manager(dqm,
> get_mqd_type_from_queue_type(q->properties.type));
>
> @@ -1188,6 +1187,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> retval = -ENOMEM;
> goto out_deallocate_doorbell;
> }
> +
> /*
> * Eviction state logic: we only mark active queues as evicted
> * to avoid the overhead of restoring inactive queues later
> @@ -1196,9 +1196,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> q->properties.is_evicted = (q->properties.queue_size > 0 &&
> q->properties.queue_percent > 0 &&
> q->properties.queue_address != 0);
> -
> dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> -
> q->properties.tba_addr = qpd->tba_addr;
> q->properties.tma_addr = qpd->tma_addr;
> retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> @@ -1206,6 +1204,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> if (retval)
> goto out_deallocate_doorbell;
>
> + dqm_lock(dqm);
> +
> list_add(&q->list, &qpd->queues_list);
> qpd->queue_count++;
> if (q->properties.is_active) {
> @@ -1233,9 +1233,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> out_deallocate_sdma_queue:
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> deallocate_sdma_queue(dqm, q->sdma_id);
> -out_unlock:
> - dqm_unlock(dqm);
> -
> +out:
> return retval;
> }
>
> @@ -1398,8 +1396,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
> qpd->reset_wavefronts = true;
> }
>
> - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> -
> /*
> * Unconditionally decrement this counter, regardless of the queue's
> * type
> @@ -1410,6 +1406,9 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>
> dqm_unlock(dqm);
>
> + /* Do uninit_mqd after dqm_unlock(dqm) to avoid circular locking */
> + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +
> return retval;
>
> failed:
> @@ -1631,7 +1630,11 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> qpd->reset_wavefronts = false;
> }
>
> - /* lastly, free mqd resources */
> + dqm_unlock(dqm);
> +
> + /* Lastly, free mqd resources.
> + * Do uninit_mqd() after dqm_unlock to avoid circular locking.
> + */
> list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
> mqd_mgr = dqm->ops.get_mqd_manager(dqm,
> get_mqd_type_from_queue_type(q->properties.type));
> @@ -1645,7 +1648,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> }
>
> out:
> - dqm_unlock(dqm);
> return retval;
> }
>