[PATCH] drm/amdkfd: fix missed queue reset on queue destroy

Wed Aug 21 21:50:41 UTC 2024

On 2024-08-21 17:17, Jonathan Kim wrote:
> If a queue is being destroyed but causes a HWS hang on removal, the KFD
> may issue an unnecessary gpu reset if the destroyed queue can be fixed
> by a queue reset.
>
> This is because the queue has been removed from the KFD's queue list
> prior to the preemption action on destroy so the reset call will fail to
> match the HQD PQ reset information against the KFD's queue record to do
> the actual reset.
>
> Since a queue destroy request is under the same device lock as any other
> preemption request (which subsumes queue reset calls), transiently
> store the destroyed queue's reference so that a potential subsequent queue
> reset call can check against this queue as well.

Maybe this could be simplified by disabling the queues before destroying 
it. That way the queue would still exist when it's being unmapped and 
you don't need to hack the special case "cur_destroyed_queue" into the 
queue reset code.

Regards,
   Felix

>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 +++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
>   2 files changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 577d121cc6d1..09e39a72ca31 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1842,6 +1842,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   		goto fail_detect_hang_buffer;
>   	}
>   
> +	dqm->cur_destroyed_queue = NULL;
> +
>   	dqm_unlock(dqm);
>   
>   	return 0;
> @@ -2105,7 +2107,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
>   		q->properties.queue_id, q->process->pasid);
>   
>   	pdd->has_reset_queue = true;
> -	if (q->properties.is_active) {
> +	if (q->properties.is_active && dqm->cur_destroyed_queue != q) {
>   		q->properties.is_active = false;
>   		decrement_queue_count(dqm, qpd, q);
>   	}
> @@ -2160,6 +2162,10 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
>   	struct qcm_process_device *qpd;
>   	struct queue *q;
>   
> +	if (dqm->cur_destroyed_queue &&
> +	    dqm->cur_destroyed_queue->properties.queue_address == queue_address)
> +		return dqm->cur_destroyed_queue;
> +
>   	list_for_each_entry(cur, &dqm->queues, list) {
>   		qpd = cur->qpd;
>   		list_for_each_entry(q, &qpd->queues_list, list) {
> @@ -2409,6 +2415,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   
>   	list_del(&q->list);
>   	qpd->queue_count--;
> +	dqm->cur_destroyed_queue = q;
>   	if (q->properties.is_active) {
>   		decrement_queue_count(dqm, qpd, q);
>   		if (!dqm->dev->kfd->shared_resources.enable_mes) {
> @@ -2421,6 +2428,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   			retval = remove_queue_mes(dqm, q, qpd);
>   		}
>   	}
> +	dqm->cur_destroyed_queue = NULL;
>   
>   	/*
>   	 * Unconditionally decrement this counter, regardless of the queue's
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 08b40826ad1e..5425c1dd7924 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -285,6 +285,7 @@ struct device_queue_manager {
>   	struct dqm_detect_hang_info *detect_hang_info;
>   	size_t detect_hang_info_size;
>   	int detect_hang_count;
> +	struct queue *cur_destroyed_queue;
>   };
>   
>   void device_queue_manager_init_cik(