[PATCH v2] drm/amd: Add a workaround for GFX11 systems that fail to flush TLB
Christian König
ckoenig.leichtzumerken at gmail.com
Thu Dec 14 10:28:43 UTC 2023
Am 13.12.23 um 21:31 schrieb Mario Limonciello:
> Some systems with MP1 13.0.4 or 13.0.11 have a firmware bug that
> causes the first MES packet after resume to fail. Typically this
> packet is used to flush the TLB when GART is enabled.
>
> This issue is fixed in newer firmware, but as OEMs may not roll this
> out to the field, introduce a workaround that adds an extra dummy
> read on resume whose result is discarded.
>
> Cc: stable at vger.kernel.org # 6.1+
> Cc: Tim Huang <Tim.Huang at amd.com>
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3045
> Signed-off-by: Mario Limonciello <mario.limonciello at amd.com>
> ---
> v1->v2:
> * Add a dummy read callback instead and use that.
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 19 +++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 3 +++
> drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 11 +++++++++++
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 8 ++++++--
> 4 files changed, 39 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 9ddbf1494326..cd5e1a027bdf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -868,6 +868,25 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, uint32_t reg,
> return r;
> }
>
> +void amdgpu_mes_reg_dummy_read(struct amdgpu_device *adev)
> +{
> + struct mes_misc_op_input op_input = {
> + .op = MES_MISC_OP_READ_REG,
> + .read_reg.reg_offset = 0,
> + .read_reg.buffer_addr = adev->mes.read_val_gpu_addr,
> + };
> +
> + if (!adev->mes.funcs->misc_op) {
> + DRM_ERROR("mes misc op is not supported!\n");
> + return;
> + }
> +
> + adev->mes.silent_errors = true;
I really think we should not have hacks like that.
Let's instead adjust the error message to note that updating the firmware
might help.
Regards,
Christian.
> + if (adev->mes.funcs->misc_op(&adev->mes, &op_input))
> + DRM_DEBUG("failed to amdgpu_mes_reg_dummy_read\n");
> + adev->mes.silent_errors = false;
> +}
> +
> int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
> uint64_t process_context_addr,
> uint32_t spi_gdbg_per_vmid_cntl,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index a27b424ffe00..d208e60c1d99 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -135,6 +135,8 @@ struct amdgpu_mes {
>
> /* ip specific functions */
> const struct amdgpu_mes_funcs *funcs;
> +
> + bool silent_errors;
> };
>
> struct amdgpu_mes_process {
> @@ -356,6 +358,7 @@ int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,
> u64 gpu_addr, u64 seq);
>
> uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg);
> +void amdgpu_mes_reg_dummy_read(struct amdgpu_device *adev);
> int amdgpu_mes_wreg(struct amdgpu_device *adev,
> uint32_t reg, uint32_t val);
> int amdgpu_mes_reg_wait(struct amdgpu_device *adev, uint32_t reg,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 23d7b548d13f..a2ba45f859ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -960,6 +960,17 @@ static int gmc_v11_0_resume(void *handle)
> int r;
> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>
> + switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
> + case IP_VERSION(13, 0, 4):
> + case IP_VERSION(13, 0, 11):
> + /* avoid a lost packet @ first GFXOFF exit after resume */
> + if ((adev->pm.fw_version & 0x00FFFFFF) < 0x004c4900 && adev->in_s0ix)
> + amdgpu_mes_reg_dummy_read(adev);
> + break;
> + default:
> + break;
> + }
> +
> r = gmc_v11_0_hw_init(adev);
> if (r)
> return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 4dfec56e1b7f..71df5cb65485 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -137,8 +137,12 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
> timeout);
> if (r < 1) {
> - DRM_ERROR("MES failed to response msg=%d\n",
> - x_pkt->header.opcode);
> + if (mes->silent_errors)
> + DRM_DEBUG("MES failed to response msg=%d\n",
> + x_pkt->header.opcode);
> + else
> + DRM_ERROR("MES failed to response msg=%d\n",
> + x_pkt->header.opcode);
>
> while (halt_if_hws_hang)
> schedule();
More information about the amd-gfx
mailing list