[v3 8/9] drm/amdgpu/userq: add a detect and reset callback

Tue Jun 17 09:56:32 UTC 2025

On 6/17/2025 2:50 PM, Jesse.Zhang wrote:
> From: Alex Deucher <alexander.deucher at amd.com>
> 
> Add a detect and reset callback and add the implementation
> for mes.  The callback will detect all hung queues of a
> particular ip type (e.g., GFX or compute or SDMA) and
> reset them.
> 
> v2: increase reset counter and set fence force completion (Jesse)
> 
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |  3 ++
>  drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 51 ++++++++++++++++++++++
>  2 files changed, 54 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index ec040c2fd6c9..0335ff03f65f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -77,6 +77,9 @@ struct amdgpu_userq_funcs {
>  		     struct amdgpu_usermode_queue *queue);
>  	int (*map)(struct amdgpu_userq_mgr *uq_mgr,
>  		   struct amdgpu_usermode_queue *queue);
> +	int (*detect_and_reset)(struct amdgpu_device *adev,
> +				int queue_type);
> +
>  };
>  
>  /* Usermode queues for gfx */
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index d6f50b13e2ba..52d438b5dcef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -21,6 +21,7 @@
>   * OTHER DEALINGS IN THE SOFTWARE.
>   *
>   */
> +#include <drm/drm_drv.h>
>  #include "amdgpu.h"
>  #include "amdgpu_gfx.h"
>  #include "mes_userqueue.h"
> @@ -198,6 +199,55 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
>  	return 0;
>  }
>  
> +static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
> +				      int queue_type)
> +{
> +	int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
> +	struct mes_detect_and_reset_queue_input input;
> +	struct amdgpu_usermode_queue *queue;
> +	struct amdgpu_userq_mgr *uqm, *tmp;
> +	unsigned int hung_db_num = 0;
> +	int queue_id, r, i;
> +	u32 db_array[4];
> +
> +	if (db_array_size > 4) {
> +		dev_err(adev->dev, "DB array size (%d vs 4) too small\n",
> +			db_array_size);
> +		return -EINVAL;
> +	}
> +
> +	memset(&input, 0x0, sizeof(struct mes_detect_and_reset_queue_input));
> +
> +	input.queue_type = queue_type;
> +
> +	amdgpu_mes_lock(&adev->mes);
> +	r = amdgpu_mes_detect_and_reset_hung_queues(adev, queue_type, false,
> +						    &hung_db_num, db_array);
> +	amdgpu_mes_unlock(&adev->mes);
> +	if (r) {
> +		dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
> +	} else if (hung_db_num) {
> +		mutex_lock(&adev->userq_mutex);
> +		list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +			idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> +				if (queue->queue_type == queue_type) {
> +					for (i = 0; i < hung_db_num; i++) {
> +						if (queue->doorbell_index == db_array[i]) {
> +							queue->state = AMDGPU_USERQ_STATE_HUNG;

After the queue has been reset and its outstanding work force-completed,
why is the queue state still left as hung? Does that mean no more work can
be submitted to it even after the reset? And where is this state checked?

Thanks,
Lijo

> +							atomic_inc(&adev->gpu_reset_counter);
> +							amdgpu_userq_fence_driver_force_completion(queue);
> +							drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
> +						}
> +					}
> +				}
> +			}
> +		}
> +		mutex_unlock(&adev->userq_mutex);
> +	}
> +
> +	return r;
> +}
> +
>  static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
>  				struct drm_amdgpu_userq_in *args_in,
>  				struct amdgpu_usermode_queue *queue)
> @@ -352,4 +402,5 @@ const struct amdgpu_userq_funcs userq_mes_funcs = {
>  	.mqd_destroy = mes_userq_mqd_destroy,
>  	.unmap = mes_userq_unmap,
>  	.map = mes_userq_map,
> +	.detect_and_reset = mes_userq_detect_and_reset,
>  };