[PATCH] drm/amdgpu/mes: fix mes ring buffer overflow

Christian König ckoenig.leichtzumerken at gmail.com
Mon Jul 22 08:20:45 UTC 2024


Thx, but in that case this patch here then won't help either it just 
mitigates the problem.

Can you try to reduce num_hw_submission for the MES ring?

Thanks,
Christian.

Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> />> I think we rather need to increase the MES ring size instead./
>
> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>
> Regards,
> Jack
>
> ------------------------------------------------------------------------
> *From:* Christian König <ckoenig.leichtzumerken at gmail.com>
> *Sent:* Friday, 19 July 2024 23:44
> *To:* Xiao, Jack <Jack.Xiao at amd.com>; amd-gfx at lists.freedesktop.org 
> <amd-gfx at lists.freedesktop.org>; Deucher, Alexander 
> <Alexander.Deucher at amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
> Am 19.07.24 um 11:16 schrieb Jack Xiao:
> > wait memory room until enough before writing mes packets
> > to avoid ring buffer overflow.
> >
> > Signed-off-by: Jack Xiao <Jack.Xiao at amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
> >   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
> >   2 files changed, 28 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index 8ce51b9236c1..68c74adf79f1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -168,7 +168,7 @@ static int 
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        const char *op_str, *misc_op_str;
> >        unsigned long flags;
> >        u64 status_gpu_addr;
> > -     u32 status_offset;
> > +     u32 seq, status_offset;
> >        u64 *status_ptr;
> >        signed long r;
> >        int ret;
> > @@ -196,6 +196,13 @@ static int 
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        if (r)
> >                goto error_unlock_free;
> >
> > +     seq = ++ring->fence_drv.sync_seq;
> > +     r = amdgpu_fence_wait_polling(ring,
> > +                                   seq - 
> ring->fence_drv.num_fences_mask,
> > +                                   timeout);
> > +     if (r < 1)
> > +             goto error_undo;
> > +
> >        api_status = (struct MES_API_STATUS *)((char *)pkt + 
> api_status_off);
> >        api_status->api_completion_fence_addr = status_gpu_addr;
> >        api_status->api_completion_fence_value = 1;
> > @@ -208,8 +215,7 @@ static int 
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> >                ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > -             ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -229,7 +235,7 @@ static int 
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> >                        x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, 
> timeout);
> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
> >        if (r < 1 || !*status_ptr) {
> >
> >                if (misc_op_str)
> > @@ -252,6 +258,10 @@ static int 
> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        amdgpu_device_wb_free(adev, status_offset);
> >        return 0;
> >
> > +error_undo:
> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
> > +     amdgpu_ring_undo(ring);
> > +
> >   error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > index c9f74231ad59..48e01206bcc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > @@ -154,7 +154,7 @@ static int 
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        const char *op_str, *misc_op_str;
> >        unsigned long flags;
> >        u64 status_gpu_addr;
> > -     u32 status_offset;
> > +     u32 seq, status_offset;
> >        u64 *status_ptr;
> >        signed long r;
> >        int ret;
> > @@ -182,6 +182,13 @@ static int 
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        if (r)
> >                goto error_unlock_free;
> >
> > +     seq = ++ring->fence_drv.sync_seq;
> > +     r = amdgpu_fence_wait_polling(ring,
> > +                                   seq - 
> ring->fence_drv.num_fences_mask,
>
> That's what's amdgpu_fence_emit_polling() does anyway.
>
> So this here just moves the polling a bit earlier.
>
> I think we rather need to increase the MES ring size instead.
>
> Regards,
> Christian.
>
>
> > +                                   timeout);
> > +     if (r < 1)
> > +             goto error_undo;
> > +
> >        api_status = (struct MES_API_STATUS *)((char *)pkt + 
> api_status_off);
> >        api_status->api_completion_fence_addr = status_gpu_addr;
> >        api_status->api_completion_fence_value = 1;
> > @@ -194,8 +201,7 @@ static int 
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> > mes_status_pkt.api_status.api_completion_fence_addr =
> >                ring->fence_drv.gpu_addr;
> > - mes_status_pkt.api_status.api_completion_fence_value =
> > -             ++ring->fence_drv.sync_seq;
> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
> >
> >        amdgpu_ring_write_multiple(ring, &mes_status_pkt,
> > sizeof(mes_status_pkt) / 4);
> > @@ -215,7 +221,7 @@ static int 
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >                dev_dbg(adev->dev, "MES msg=%d was emitted\n",
> >                        x_pkt->header.opcode);
> >
> > -     r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, 
> timeout);
> > +     r = amdgpu_fence_wait_polling(ring, seq, timeout);
> >        if (r < 1 || !*status_ptr) {
> >
> >                if (misc_op_str)
> > @@ -238,6 +244,10 @@ static int 
> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> >        amdgpu_device_wb_free(adev, status_offset);
> >        return 0;
> >
> > +error_undo:
> > +     dev_err(adev->dev, "MES ring buffer is full.\n");
> > +     amdgpu_ring_undo(ring);
> > +
> >   error_unlock_free:
> > spin_unlock_irqrestore(&mes->ring_lock, flags);
> >
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240722/cafbe594/attachment-0001.htm>


More information about the amd-gfx mailing list