[PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
Christian König
ckoenig.leichtzumerken at gmail.com
Mon Jul 22 11:20:54 UTC 2024
What I meant is that the MES ring is now too small for the number of
packets written to it.
Either reduce the num_hw_submission or increase the MES ring size.
E.g. see this code here in amdgpu_ring_init:
if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
sched_hw_submission = max(sched_hw_submission, 256);
else if (ring == &adev->sdma.instance[0].page)
sched_hw_submission = 256;
We are basically just missing a case for the MES here as far as I can see.
Regards,
Christian.
Am 22.07.24 um 10:46 schrieb Xiao, Jack:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> >> Can you try to reduce num_hw_submission for the MES ring?
> A smaller num_hw_submission should not help with this issue, because MES
> works without the drm scheduler, like the legacy KIQ. A smaller
> num_hw_submission will result in a smaller MES ring size and more waiting time.
>
> Regards,
> Jack
> ------------------------------------------------------------------------
> *From:* Christian König <ckoenig.leichtzumerken at gmail.com>
> *Sent:* Monday, 22 July 2024 16:20
> *To:* Xiao, Jack <Jack.Xiao at amd.com>; amd-gfx at lists.freedesktop.org
> <amd-gfx at lists.freedesktop.org>; Deucher, Alexander
> <Alexander.Deucher at amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
> Thx, but in that case this patch here won't help either; it just
> mitigates the problem.
>
> Can you try to reduce num_hw_submission for the MES ring?
>
> Thanks,
> Christian.
>
> Am 22.07.24 um 05:27 schrieb Xiao, Jack:
>>
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>>
>> >> I think we rather need to increase the MES ring size instead.
>>
>> Unfortunately, it doesn't work. I guess mes firmware has limitation.
>>
>> Regards,
>> Jack
>>
>> ------------------------------------------------------------------------
>> *From:* Christian König <ckoenig.leichtzumerken at gmail.com>
>> <mailto:ckoenig.leichtzumerken at gmail.com>
>> *Sent:* Friday, 19 July 2024 23:44
>> *To:* Xiao, Jack <Jack.Xiao at amd.com> <mailto:Jack.Xiao at amd.com>;
>> amd-gfx at lists.freedesktop.org <mailto:amd-gfx at lists.freedesktop.org>
>> <amd-gfx at lists.freedesktop.org>
>> <mailto:amd-gfx at lists.freedesktop.org>; Deucher, Alexander
>> <Alexander.Deucher at amd.com> <mailto:Alexander.Deucher at amd.com>
>> *Subject:* Re: [PATCH] drm/amdgpu/mes: fix mes ring buffer overflow
>> Am 19.07.24 um 11:16 schrieb Jack Xiao:
>> > wait memory room until enough before writing mes packets
>> > to avoid ring buffer overflow.
>> >
>> > Signed-off-by: Jack Xiao <Jack.Xiao at amd.com> <mailto:Jack.Xiao at amd.com>
>> > ---
>> > drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++----
>> > drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++----
>> > 2 files changed, 28 insertions(+), 8 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> > index 8ce51b9236c1..68c74adf79f1 100644
>> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> > @@ -168,7 +168,7 @@ static int
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > const char *op_str, *misc_op_str;
>> > unsigned long flags;
>> > u64 status_gpu_addr;
>> > - u32 status_offset;
>> > + u32 seq, status_offset;
>> > u64 *status_ptr;
>> > signed long r;
>> > int ret;
>> > @@ -196,6 +196,13 @@ static int
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > if (r)
>> > goto error_unlock_free;
>> >
>> > + seq = ++ring->fence_drv.sync_seq;
>> > + r = amdgpu_fence_wait_polling(ring,
>> > + seq -
>> ring->fence_drv.num_fences_mask,
>> > + timeout);
>> > + if (r < 1)
>> > + goto error_undo;
>> > +
>> > api_status = (struct MES_API_STATUS *)((char *)pkt +
>> api_status_off);
>> > api_status->api_completion_fence_addr = status_gpu_addr;
>> > api_status->api_completion_fence_value = 1;
>> > @@ -208,8 +215,7 @@ static int
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>> > mes_status_pkt.api_status.api_completion_fence_addr =
>> > ring->fence_drv.gpu_addr;
>> > - mes_status_pkt.api_status.api_completion_fence_value =
>> > - ++ring->fence_drv.sync_seq;
>> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
>> >
>> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>> > sizeof(mes_status_pkt) / 4);
>> > @@ -229,7 +235,7 @@ static int
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>> > x_pkt->header.opcode);
>> >
>> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
>> timeout);
>> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
>> > if (r < 1 || !*status_ptr) {
>> >
>> > if (misc_op_str)
>> > @@ -252,6 +258,10 @@ static int
>> mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > amdgpu_device_wb_free(adev, status_offset);
>> > return 0;
>> >
>> > +error_undo:
>> > + dev_err(adev->dev, "MES ring buffer is full.\n");
>> > + amdgpu_ring_undo(ring);
>> > +
>> > error_unlock_free:
>> > spin_unlock_irqrestore(&mes->ring_lock, flags);
>> >
>> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> > index c9f74231ad59..48e01206bcc4 100644
>> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>> > @@ -154,7 +154,7 @@ static int
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > const char *op_str, *misc_op_str;
>> > unsigned long flags;
>> > u64 status_gpu_addr;
>> > - u32 status_offset;
>> > + u32 seq, status_offset;
>> > u64 *status_ptr;
>> > signed long r;
>> > int ret;
>> > @@ -182,6 +182,13 @@ static int
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > if (r)
>> > goto error_unlock_free;
>> >
>> > + seq = ++ring->fence_drv.sync_seq;
>> > + r = amdgpu_fence_wait_polling(ring,
>> > + seq -
>> ring->fence_drv.num_fences_mask,
>>
>> That's what's amdgpu_fence_emit_polling() does anyway.
>>
>> So this here just moves the polling a bit earlier.
>>
>> I think we rather need to increase the MES ring size instead.
>>
>> Regards,
>> Christian.
>>
>>
>> > + timeout);
>> > + if (r < 1)
>> > + goto error_undo;
>> > +
>> > api_status = (struct MES_API_STATUS *)((char *)pkt +
>> api_status_off);
>> > api_status->api_completion_fence_addr = status_gpu_addr;
>> > api_status->api_completion_fence_value = 1;
>> > @@ -194,8 +201,7 @@ static int
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>> > mes_status_pkt.api_status.api_completion_fence_addr =
>> > ring->fence_drv.gpu_addr;
>> > - mes_status_pkt.api_status.api_completion_fence_value =
>> > - ++ring->fence_drv.sync_seq;
>> > + mes_status_pkt.api_status.api_completion_fence_value = seq;
>> >
>> > amdgpu_ring_write_multiple(ring, &mes_status_pkt,
>> > sizeof(mes_status_pkt) / 4);
>> > @@ -215,7 +221,7 @@ static int
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > dev_dbg(adev->dev, "MES msg=%d was emitted\n",
>> > x_pkt->header.opcode);
>> >
>> > - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
>> timeout);
>> > + r = amdgpu_fence_wait_polling(ring, seq, timeout);
>> > if (r < 1 || !*status_ptr) {
>> >
>> > if (misc_op_str)
>> > @@ -238,6 +244,10 @@ static int
>> mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>> > amdgpu_device_wb_free(adev, status_offset);
>> > return 0;
>> >
>> > +error_undo:
>> > + dev_err(adev->dev, "MES ring buffer is full.\n");
>> > + amdgpu_ring_undo(ring);
>> > +
>> > error_unlock_free:
>> > spin_unlock_irqrestore(&mes->ring_lock, flags);
>> >
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240722/d15380d0/attachment-0001.htm>
More information about the amd-gfx
mailing list