[PATCH] amd/amdgpu: perform soft reset for sdma fed error

YiPeng Chai YiPeng.Chai at amd.com
Fri May 12 07:47:50 UTC 2023


An SDMA FED error may leave the SDMA engine in a state that
the SDMA IB ring test does not detect as a hang. In that case,
force a soft reset so that the SDMA engine can be recovered.

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
index 3d9a80511a45..1d463e1fd3ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
@@ -53,6 +53,12 @@ MODULE_FIRMWARE("amdgpu/sdma_6_0_3.bin");
 #define SDMA0_HYP_DEC_REG_START 0x5880
 #define SDMA0_HYP_DEC_REG_END 0x589a
 #define SDMA1_HYP_DEC_REG_OFFSET 0x20
+#define regRLC_RLCS_FED_STATUS_0                     0x4eff
+#define regRLC_RLCS_FED_STATUS_0_BASE_IDX            1
+#define RLC_RLCS_FED_STATUS_0__SDMA0_FED_ERR_MASK    0x00000040L
+#define RLC_RLCS_FED_STATUS_0__SDMA1_FED_ERR_MASK    0x00000080L
+#define RLC_RLCS_FED_STATUS_0__SDMA0_FED_ERR__SHIFT  0x6
+#define RLC_RLCS_FED_STATUS_0__SDMA1_FED_ERR__SHIFT  0x7
 
 static void sdma_v6_0_set_ring_funcs(struct amdgpu_device *adev);
 static void sdma_v6_0_set_buffer_funcs(struct amdgpu_device *adev);
@@ -760,6 +766,7 @@ static bool sdma_v6_0_check_soft_reset(void *handle)
 	struct amdgpu_ring *ring;
 	int i, r;
 	long tmo = msecs_to_jiffies(1000);
+	uint32_t rlc_status0 = 0;
 
 	for (i = 0; i < adev->sdma.num_instances; i++) {
 		ring = &adev->sdma.instance[i].ring;
@@ -768,6 +775,15 @@ static bool sdma_v6_0_check_soft_reset(void *handle)
 			return true;
 	}
 
+	/* An SDMA FED error requires a soft reset to recover the SDMA
+	 * engine. If the IB ring tests above did not detect a hang, check
+	 * the RLC FED status and force a soft reset if SDMA0 or SDMA1 is flagged.
+	 */
+	rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0);
+	if (REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA0_FED_ERR) ||
+	    REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA1_FED_ERR))
+		return true;
+
 	return false;
 }
 
-- 
2.34.1



More information about the amd-gfx mailing list