[PATCH 6/6] drm/amdkfd: Fix sdma queue allocate race condition

Kuehling, Felix Felix.Kuehling at amd.com
Wed Jun 5 22:24:42 UTC 2019


I think the simpler way to fix this is to restructure 
create_queue_cpsch similarly to the nocpsch version, where we allocate 
the MQD early and take the DQM lock right after that. That way you 
don't need locked and unlocked variants of allocate_sdma_queue and 
deallocate_sdma_queue.
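
Roughly this ordering (a sketch only, not the actual patch; helper 
names like allocate_mqd, free_mqd and get_mqd_type_from_queue_type are 
borrowed from the existing KFD code, and the doorbell, MQD init and 
runlist steps plus most error handling are elided):

    static int create_queue_cpsch(struct device_queue_manager *dqm,
                    struct queue *q,
                    struct qcm_process_device *qpd)
    {
            struct mqd_manager *mqd_mgr;
            int retval = 0;

            mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
                            q->properties.type)];

            /* Allocate the MQD before taking the DQM lock, as the
             * nocpsch path does; the allocation may sleep, so it must
             * not happen under the lock.
             */
            q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev,
                            &q->properties);
            if (!q->mqd_mem_obj)
                    return -ENOMEM;

            dqm_lock(dqm);

            /* The lock is held for the rest of queue creation, so a
             * single allocate_sdma_queue() that expects the caller to
             * hold the lock is sufficient; no _locked/_unlocked
             * variants are needed.
             */
            if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
                q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
                    retval = allocate_sdma_queue(dqm, q);
                    if (retval)
                            goto out_unlock;
            }

            /* ... doorbell allocation, init_mqd, queue list and
             * runlist updates go here, still under the lock ...
             */

            dqm_unlock(dqm);
            return 0;

    out_unlock:
            dqm_unlock(dqm);
            /* In this sketch q->mqd has not been set yet; free_mqd
             * only needs the backing mem_obj allocated above.
             */
            mqd_mgr->free_mqd(mqd_mgr->dev, q->mqd, q->mqd_mem_obj);
            return retval;
    }

With that structure the SDMA bitmaps are only ever touched with the 
DQM lock held, so the locked/unlocked wrapper pair introduced by this 
patch becomes unnecessary.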

Regards,
   Felix


On 2019-06-05 12:06 p.m., Zeng, Oak wrote:
> SDMA queue allocation requires the dqm lock, as it modifies
> global dqm members. Introduce functions to allocate/deallocate
> for use with the dqm lock held and not held.
>
> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
> Signed-off-by: Oak Zeng <Oak.Zeng at amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 34 ++++++++++++++++------
>   1 file changed, 25 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 6b1a2ee..52e4ede 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -53,6 +53,8 @@ static int map_queues_cpsch(struct device_queue_manager *dqm);
>   
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   				struct queue *q);
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
> +				struct queue *q);
>   
>   static inline void deallocate_hqd(struct device_queue_manager *dqm,
>   				struct queue *q);
> @@ -434,10 +436,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>   		deallocate_hqd(dqm, q);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   		dqm->sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		dqm->xgmi_sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	} else {
>   		pr_debug("q->properties.type %d is invalid\n",
>   				q->properties.type);
> @@ -914,9 +916,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   {
>   	int bit;
>   
> +	dqm_lock(dqm);
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> -		if (dqm->sdma_bitmap == 0)
> +		if (dqm->sdma_bitmap == 0) {
> +			dqm_unlock(dqm);
>   			return -ENOMEM;
> +		}
>   		bit = __ffs64(dqm->sdma_bitmap);
>   		dqm->sdma_bitmap &= ~(1ULL << bit);
>   		q->sdma_id = bit;
> @@ -925,8 +930,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   		q->properties.sdma_queue_id = q->sdma_id /
>   				get_num_sdma_engines(dqm);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -		if (dqm->xgmi_sdma_bitmap == 0)
> +		if (dqm->xgmi_sdma_bitmap == 0) {
> +			dqm_unlock(dqm);
>   			return -ENOMEM;
> +		}
>   		bit = __ffs64(dqm->xgmi_sdma_bitmap);
>   		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>   		q->sdma_id = bit;
> @@ -942,13 +949,14 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   				get_num_xgmi_sdma_engines(dqm);
>   	}
>   
> +	dqm_unlock(dqm);
>   	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
>   	pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
>   
>   	return 0;
>   }
>   
> -static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>   				struct queue *q)
>   {
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> @@ -962,6 +970,14 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   	}
>   }
>   
> +static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct queue *q)
> +{
> +	dqm_lock(dqm);
> +	deallocate_sdma_queue_locked(dqm, q);
> +	dqm_unlock(dqm);
> +}
> +
>   /*
>    * Device Queue Manager implementation for cp scheduler
>    */
> @@ -1356,10 +1372,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   		dqm->sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		dqm->xgmi_sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	}
>   
>   	list_del(&q->list);
> @@ -1585,10 +1601,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   	list_for_each_entry(q, &qpd->queues_list, list) {
>   		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   			dqm->sdma_queue_count--;
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue_locked(dqm, q);
>   		} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   			dqm->xgmi_sdma_queue_count--;
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue_locked(dqm, q);
>   		}
>   
>   		if (q->properties.is_active)

