[PATCH v2] drm/amdgpu: Start using amdgpu_ras_add_bad_pages

Andrey Grodzovsky andrey.grodzovsky at amd.com
Tue Aug 13 17:59:11 UTC 2019


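After querying the UMC error addresses, convert each newly reported
address to a page frame number and record it through
amdgpu_ras_add_bad_pages().

v2: Trigger a GPU reset when new bad addresses are reported.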

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 296e2d9..f5f36ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -243,17 +243,40 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 		struct ras_err_data *err_data,
 		struct amdgpu_iv_entry *entry)
 {
+	unsigned long new_err_addr_cnt, old_err_addr_cnt;
+	new_err_addr_cnt = 0;
+	old_err_addr_cnt = err_data->err_addr_cnt;
+
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 	if (adev->umc.funcs->query_ras_error_count)
 		adev->umc.funcs->query_ras_error_count(adev, err_data);
 	/* umc query_ras_error_address is also responsible for clearing
 	 * error status
 	 */
-	if (adev->umc.funcs->query_ras_error_address)
+	if (adev->umc.funcs->query_ras_error_address) {
+		unsigned long *bps;
+		int i;
+
 		adev->umc.funcs->query_ras_error_address(adev, err_data);
 
+		new_err_addr_cnt = err_data->err_addr_cnt - old_err_addr_cnt;
+
+		if (new_err_addr_cnt) {
+			bps = kcalloc(new_err_addr_cnt, sizeof(*bps), GFP_KERNEL);
+			if (!bps)
+				return -ENOMEM;
+
+			for (i = 0; i < new_err_addr_cnt; i++)
+				bps[i] = err_data->err_addr[old_err_addr_cnt + i]  >> PAGE_SHIFT;
+
+			amdgpu_ras_add_bad_pages(adev, bps, new_err_addr_cnt);
+
+			kfree(bps);
+		}
+	}
+
 	/* only uncorrectable error needs gpu reset */
-	if (err_data->ue_count)
+	if (err_data->ue_count || new_err_addr_cnt)
 		amdgpu_ras_reset_gpu(adev, 0);
 
 	return AMDGPU_RAS_SUCCESS;
-- 
2.7.4



More information about the amd-gfx mailing list