[v6 03/13] drm/amdgpu/mes12: implement detect and reset callback
Zhang, Jesse(Jie)
Jesse.Zhang at amd.com
Tue Aug 5 06:13:35 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
-----Original Message-----
From: Alex Deucher <alexdeucher at gmail.com>
Sent: Tuesday, August 5, 2025 1:01 AM
To: Zhang, Jesse(Jie) <Jesse.Zhang at amd.com>
Cc: amd-gfx at lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
Subject: Re: [v6 03/13] drm/amdgpu/mes12: implement detect and reset callback
On Mon, Aug 4, 2025 at 4:53 AM Jesse.Zhang <Jesse.Zhang at amd.com> wrote:
>
> From: Alex Deucher <alexander.deucher at amd.com>
>
> Implement support for the hung queue detect and reset functionality.
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 37 ++++++++++++++++++++++++++
> 1 file changed, 37 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 6b222630f3fa..29d38aa1897e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -47,6 +47,8 @@ static int mes_v12_0_kiq_hw_fini(struct amdgpu_device *adev);
>
> #define MES_EOP_SIZE 2048
>
> +#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 4
> +
> static void mes_v12_0_ring_set_wptr(struct amdgpu_ring *ring)
> {
> struct amdgpu_device *adev = ring->adev;
> @@ -879,6 +881,38 @@ static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes,
> offsetof(union MESAPI__RESET, api_status));
> }
>
> +static int mes_v12_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> + struct mes_detect_and_reset_queue_input *input)
> +{
> + union MESAPI__RESET mes_reset_queue_pkt;
> + int pipe;
> +
> + memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
> +
> + mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
> + mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET;
> + mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> +
> + mes_reset_queue_pkt.queue_type =
> + convert_to_mes_queue_type(input->queue_type);
> + mes_reset_queue_pkt.doorbell_offset_addr =
> + mes->hung_queue_db_array_gpu_addr;
> +
> + if (input->detect_only)
> + mes_reset_queue_pkt.hang_detect_only = 1;
> + else
> + mes_reset_queue_pkt.hang_detect_then_reset = 1;
> +
> + if (mes->adev->enable_uni_mes)
> + pipe = AMDGPU_MES_KIQ_PIPE;
> + else
> + pipe = AMDGPU_MES_SCHED_PIPE;
I think this should probably always be AMDGPU_MES_SCHED_PIPE. Setting this may fix the issues you were seeing on gfx12.
Thanks Alex. This change fixed the issues with the GFX12 compute queue.
Thanks
Jesse
Alex
> +
> + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
> + &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt),
> + offsetof(union MESAPI__RESET, api_status));
> +}
> +
> static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
> .add_hw_queue = mes_v12_0_add_hw_queue,
> .remove_hw_queue = mes_v12_0_remove_hw_queue,
> @@ -888,6 +922,7 @@ static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
> .resume_gang = mes_v12_0_resume_gang,
> .misc_op = mes_v12_0_misc_op,
> .reset_hw_queue = mes_v12_0_reset_hw_queue,
> + .detect_and_reset_hung_queues = mes_v12_0_detect_and_reset_hung_queues,
> };
>
> static int mes_v12_0_allocate_ucode_buffer(struct amdgpu_device *adev,
> @@ -1793,6 +1828,8 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block *ip_block)
> struct amdgpu_device *adev = ip_block->adev;
> int pipe, r;
>
> + adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
> for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
> r = amdgpu_mes_init_microcode(adev, pipe);
> if (r)
> --
> 2.49.0
>
More information about the amd-gfx mailing list