[v6 03/13] drm/amdgpu/mes12: implement detect and reset callback

Tue Aug 5 06:13:35 UTC 2025

[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: Alex Deucher <alexdeucher at gmail.com>
Sent: Tuesday, August 5, 2025 1:01 AM
To: Zhang, Jesse(Jie) <Jesse.Zhang at amd.com>
Cc: amd-gfx at lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
Subject: Re: [v6 03/13] drm/amdgpu/mes12: implement detect and reset callback

On Mon, Aug 4, 2025 at 4:53 AM Jesse.Zhang <Jesse.Zhang at amd.com> wrote:
>
> From: Alex Deucher <alexander.deucher at amd.com>
>
> Implement support for the hung queue detect and reset functionality.
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 37
> ++++++++++++++++++++++++++
>  1 file changed, 37 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 6b222630f3fa..29d38aa1897e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -47,6 +47,8 @@ static int mes_v12_0_kiq_hw_fini(struct
> amdgpu_device *adev);
>
>  #define MES_EOP_SIZE   2048
>
> +#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 4
> +
>  static void mes_v12_0_ring_set_wptr(struct amdgpu_ring *ring)  {
>         struct amdgpu_device *adev = ring->adev; @@ -879,6 +881,38 @@
> static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes,
>                         offsetof(union MESAPI__RESET, api_status));  }
>
> +static int mes_v12_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> +                                                 struct mes_detect_and_reset_queue_input *input)
> +{
> +       union MESAPI__RESET mes_reset_queue_pkt;
> +       int pipe;
> +
> +       memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
> +
> +       mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
> +       mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET;
> +       mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> +
> +       mes_reset_queue_pkt.queue_type =
> +               convert_to_mes_queue_type(input->queue_type);
> +       mes_reset_queue_pkt.doorbell_offset_addr =
> +               mes->hung_queue_db_array_gpu_addr;
> +
> +       if (input->detect_only)
> +               mes_reset_queue_pkt.hang_detect_only = 1;
> +       else
> +               mes_reset_queue_pkt.hang_detect_then_reset = 1;
> +
> +       if (mes->adev->enable_uni_mes)
> +               pipe = AMDGPU_MES_KIQ_PIPE;
> +       else
> +               pipe = AMDGPU_MES_SCHED_PIPE;

I think this should probably always be AMDGPU_MES_SCHED_PIPE.  Setting this may fix the issues you were seeing on gfx12.

Thanks, Alex. Always using AMDGPU_MES_SCHED_PIPE here fixed the issues with the GFX12 compute queue.

Thanks
Jesse

Alex

> +
> +       return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
> +                       &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt),
> +                       offsetof(union MESAPI__RESET, api_status));
> +}
> +
>  static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
>         .add_hw_queue = mes_v12_0_add_hw_queue,
>         .remove_hw_queue = mes_v12_0_remove_hw_queue, @@ -888,6 +922,7
> @@ static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
>         .resume_gang = mes_v12_0_resume_gang,
>         .misc_op = mes_v12_0_misc_op,
>         .reset_hw_queue = mes_v12_0_reset_hw_queue,
> +       .detect_and_reset_hung_queues =
> + mes_v12_0_detect_and_reset_hung_queues,
>  };
>
>  static int mes_v12_0_allocate_ucode_buffer(struct amdgpu_device
> *adev, @@ -1793,6 +1828,8 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block *ip_block)
>         struct amdgpu_device *adev = ip_block->adev;
>         int pipe, r;
>
> +       adev->mes.hung_queue_db_array_size =
> +               MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
>         for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
>                 r = amdgpu_mes_init_microcode(adev, pipe);
>                 if (r)
> --
> 2.49.0
>