[PATCH] drm/amd: Add a workaround for GFX11 systems that fail to flush TLB

Wed Dec 13 17:04:54 UTC 2023

Some systems with MP1 13.0.4 or 13.0.11 have a firmware bug that
causes the first MES packet after resume to fail. This packet is
used to flush the TLB when GART is enabled.

This issue is fixed in newer firmware, but as OEMs may not roll this
out to the field, introduce a workaround that will retry the flush
when detecting running on an older firmware and decrease relevant
error messages to debug while workaround is in use.

Cc: stable at vger.kernel.org # 6.1+
Cc: Tim Huang <Tim.Huang at amd.com>
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3045
Signed-off-by: Mario Limonciello <mario.limonciello at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 10 ++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 17 ++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  8 ++++++--
 4 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 9ddbf1494326..6ce3f6e6b6de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -836,8 +836,14 @@ int amdgpu_mes_reg_write_reg_wait(struct amdgpu_device *adev,
 	}
 
 	r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
-	if (r)
-		DRM_ERROR("failed to reg_write_reg_wait\n");
+	if (r) {
+		const char *msg = "failed to reg_write_reg_wait\n";
+
+		if (adev->mes.suspend_workaround)
+			DRM_DEBUG(msg);
+		else
+			DRM_ERROR(msg);
+	}
 
 error:
 	return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index a27b424ffe00..90f2bba3b12b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -135,6 +135,8 @@ struct amdgpu_mes {
 
 	/* ip specific functions */
 	const struct amdgpu_mes_funcs   *funcs;
+
+	bool				suspend_workaround;
 };
 
 struct amdgpu_mes_process {
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 23d7b548d13f..e810c7bb3156 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -889,7 +889,11 @@ static int gmc_v11_0_gart_enable(struct amdgpu_device *adev)
 		false : true;
 
 	adev->mmhub.funcs->set_fault_enable_default(adev, value);
-	gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+
+	do {
+		gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+		adev->mes.suspend_workaround = false;
+	} while (adev->mes.suspend_workaround);
 
 	DRM_INFO("PCIE GART of %uM enabled (table at 0x%016llX).\n",
 		 (unsigned int)(adev->gmc.gart_size >> 20),
@@ -960,6 +964,17 @@ static int gmc_v11_0_resume(void *handle)
 	int r;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+	case IP_VERSION(13, 0, 4):
+	case IP_VERSION(13, 0, 11):
+		/* avoid problems with first TLB flush after resume */
+		if ((adev->pm.fw_version & 0x00FFFFFF) < 0x004c4900)
+			adev->mes.suspend_workaround = adev->in_s0ix;
+		break;
+	default:
+		break;
+	}
+
 	r = gmc_v11_0_hw_init(adev);
 	if (r)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 4dfec56e1b7f..84ab8c611e5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -137,8 +137,12 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
 		      timeout);
 	if (r < 1) {
-		DRM_ERROR("MES failed to response msg=%d\n",
-			  x_pkt->header.opcode);
+		if (mes->suspend_workaround)
+			DRM_DEBUG("MES failed to response msg=%d\n",
+				  x_pkt->header.opcode);
+		else
+			DRM_ERROR("MES failed to response msg=%d\n",
+				  x_pkt->header.opcode);
 
 		while (halt_if_hws_hang)
 			schedule();
-- 
2.34.1