[PATCH v2] drm/amd: Add a workaround for GFX11 systems that fail to flush TLB

Christian König ckoenig.leichtzumerken at gmail.com
Thu Dec 14 10:28:43 UTC 2023



Am 13.12.23 um 21:31 schrieb Mario Limonciello:
> Some systems with MP1 13.0.4 or 13.0.11 have a firmware bug that
> causes the first MES packet after resume to fail. Typically this
> packet is used to flush the TLB when GART is enabled.
>
> This issue is fixed in newer firmware, but as OEMs may not roll this
> out to the field, introduce a workaround that performs an extra dummy
> read on resume whose result is discarded.
>
> Cc: stable at vger.kernel.org # 6.1+
> Cc: Tim Huang <Tim.Huang at amd.com>
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3045
> Signed-off-by: Mario Limonciello <mario.limonciello at amd.com>
> ---
> v1->v2:
>   * Add a dummy read callback instead and use that.
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 19 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  3 +++
>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 11 +++++++++++
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  8 ++++++--
>   4 files changed, 39 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 9ddbf1494326..cd5e1a027bdf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -868,6 +868,25 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, uint32_t reg,
>   	return r;
>   }
>   
> +void amdgpu_mes_reg_dummy_read(struct amdgpu_device *adev)
> +{
> +	struct mes_misc_op_input op_input = {
> +		.op = MES_MISC_OP_READ_REG,
> +		.read_reg.reg_offset = 0,
> +		.read_reg.buffer_addr = adev->mes.read_val_gpu_addr,
> +	};
> +
> +	if (!adev->mes.funcs->misc_op) {
> +		DRM_ERROR("mes misc op is not supported!\n");
> +		return;
> +	}
> +
> +	adev->mes.silent_errors = true;

I really think we should not have hacks like that.

Let's instead adjust the error message to note that updating the firmware
might help.

Regards,
Christian.

> +	if (adev->mes.funcs->misc_op(&adev->mes, &op_input))
> +		DRM_DEBUG("failed to amdgpu_mes_reg_dummy_read\n");
> +	adev->mes.silent_errors = false;
> +}
> +
>   int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
>   				uint64_t process_context_addr,
>   				uint32_t spi_gdbg_per_vmid_cntl,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index a27b424ffe00..d208e60c1d99 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -135,6 +135,8 @@ struct amdgpu_mes {
>   
>   	/* ip specific functions */
>   	const struct amdgpu_mes_funcs   *funcs;
> +
> +	bool				silent_errors;
>   };
>   
>   struct amdgpu_mes_process {
> @@ -356,6 +358,7 @@ int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,
>   				  u64 gpu_addr, u64 seq);
>   
>   uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg);
> +void amdgpu_mes_reg_dummy_read(struct amdgpu_device *adev);
>   int amdgpu_mes_wreg(struct amdgpu_device *adev,
>   		    uint32_t reg, uint32_t val);
>   int amdgpu_mes_reg_wait(struct amdgpu_device *adev, uint32_t reg,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 23d7b548d13f..a2ba45f859ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -960,6 +960,17 @@ static int gmc_v11_0_resume(void *handle)
>   	int r;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
> +	case IP_VERSION(13, 0, 4):
> +	case IP_VERSION(13, 0, 11):
> +		/* avoid a lost packet @ first GFXOFF exit after resume */
> +		if ((adev->pm.fw_version & 0x00FFFFFF) < 0x004c4900 && adev->in_s0ix)
> +			amdgpu_mes_reg_dummy_read(adev);
> +		break;
> +	default:
> +		break;
> +	}
> +
>   	r = gmc_v11_0_hw_init(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 4dfec56e1b7f..71df5cb65485 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -137,8 +137,12 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
>   	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
>   		      timeout);
>   	if (r < 1) {
> -		DRM_ERROR("MES failed to response msg=%d\n",
> -			  x_pkt->header.opcode);
> +		if (mes->silent_errors)
> +			DRM_DEBUG("MES failed to response msg=%d\n",
> +				  x_pkt->header.opcode);
> +		else
> +			DRM_ERROR("MES failed to response msg=%d\n",
> +				  x_pkt->header.opcode);
>   
>   		while (halt_if_hws_hang)
>   			schedule();



More information about the amd-gfx mailing list