[v7 08/11] drm/amdgpu/userq: add a detect and reset callback

Alex Deucher alexdeucher at gmail.com
Fri Aug 8 19:53:24 UTC 2025


On Tue, Aug 5, 2025 at 10:39 PM Jesse.Zhang <Jesse.Zhang at amd.com> wrote:
>
> From: Alex Deucher <alexander.deucher at amd.com>
>
> Add a detect and reset callback and add the implementation
> for mes.  The callback will detect all hung queues of a
> particular ip type (e.g., GFX or compute or SDMA) and
> reset them.
>
> v2: increase reset counter and set fence force completion
> v3: Removed userq_mutex in mes_userq_detect_and_reset since the driver already holds it when calling this function
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |  2 +
>  drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 49 ++++++++++++++++++++++
>  2 files changed, 51 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 5111d7dce86f..9fa0d1a88d71 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -81,6 +81,8 @@ struct amdgpu_userq_funcs {
>                    struct amdgpu_usermode_queue *queue);
>         int (*restore)(struct amdgpu_userq_mgr *uq_mgr,
>                    struct amdgpu_usermode_queue *queue);
> +       int (*detect_and_reset)(struct amdgpu_device *adev,
> +                 int queue_type);
>  };
>
>  /* Usermode queues for gfx */
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index 46b24035e14c..180bd4347bdc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -21,6 +21,7 @@
>   * OTHER DEALINGS IN THE SOFTWARE.
>   *
>   */
> +#include <drm/drm_drv.h>
>  #include "amdgpu.h"
>  #include "amdgpu_gfx.h"
>  #include "mes_userqueue.h"
> @@ -198,6 +199,53 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
>         return 0;
>  }
>
> +static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
> +                                     int queue_type)
> +{
> +       int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
> +       struct mes_detect_and_reset_queue_input input;
> +       struct amdgpu_usermode_queue *queue;
> +       struct amdgpu_userq_mgr *uqm, *tmp;
> +       unsigned int hung_db_num = 0;
> +       int queue_id, r, i;
> +       u32 db_array[4];
> +
> +       if (db_array_size > 4) {
> +               dev_err(adev->dev, "DB array size (%d vs 4) too small\n",
> +                       db_array_size);
> +               return -EINVAL;
> +       }
> +
> +       memset(&input, 0x0, sizeof(struct mes_detect_and_reset_queue_input));
> +
> +       input.queue_type = queue_type;
> +
> +       amdgpu_mes_lock(&adev->mes);
> +       r = amdgpu_mes_detect_and_reset_hung_queues(adev, queue_type, false,
> +                                                   &hung_db_num, db_array);
> +       amdgpu_mes_unlock(&adev->mes);
> +       if (r) {
> +               dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
> +       } else if (hung_db_num) {
> +               list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +                       idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> +                               if (queue->queue_type == queue_type) {
> +                                       for (i = 0; i < hung_db_num; i++) {
> +                                               if (queue->doorbell_index == db_array[i]) {
> +                                                       queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                                                       atomic_inc(&adev->gpu_reset_counter);
> +                                                       amdgpu_userq_fence_driver_force_completion(queue);

amdgpu_userq_fence_driver_force_completion() doesn't exist yet at this point in the series.  Move the patch that adds it up before this one.

Alex

> +                                                       drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
> +                                               }
> +                                       }
> +                               }
> +                       }
> +               }
> +       }
> +
> +       return r;
> +}
> +
>  static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
>                                 struct drm_amdgpu_userq_in *args_in,
>                                 struct amdgpu_usermode_queue *queue)
> @@ -403,4 +451,5 @@ const struct amdgpu_userq_funcs userq_mes_funcs = {
>         .map = mes_userq_map,
>         .preempt = mes_userq_preempt,
>         .restore = mes_userq_restore,
> +       .detect_and_reset = mes_userq_detect_and_reset,
>  };
> --
> 2.49.0
>


More information about the amd-gfx mailing list