[PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo create

Chen, Guchun Guchun.Chen at amd.com
Mon Sep 30 03:26:06 UTC 2019


Regards,
Guchun

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com> 
Sent: Monday, September 30, 2019 11:20 AM
To: amd-gfx at lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Chen, Guchun <Guchun.Chen at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo create

implement ras_create_bad_pages_bo to simplify ras code

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 72 +++++++++++--------------
 1 file changed, 31 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d1bafa92ca91..fe3a57e567c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1430,41 +1430,53 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
 	return ret;
 }
 
-/* called in gpu recovery/init */
-int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
+static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)
 {
+	/* Note: the caller should guarantee con and data are not NULL */
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_err_handler_data *data;
+	struct ras_err_handler_data *data = con->eh_data;
 	uint64_t bp;
-	struct amdgpu_bo *bo = NULL;
-	int i, ret = 0;
-
-	if (!con || !con->eh_data)
-		return 0;
+	struct amdgpu_bo *bo;
+	int i;
 
-	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
-	/* reserve vram at driver post stage. */
 	for (i = data->last_reserved; i < data->count; i++) {
+		bo = NULL;
 		bp = data->bps[i].retired_page;
 
-		/* There are two cases of reserve error should be ignored:
+		/* There are three cases of reserve error should be ignored:
 		 * 1) a ras bad page has been allocated (used by someone);
 		 * 2) a ras bad page has been reserved (duplicate error injection
 		 *    for one page);
+		 * 3) bo info isn't lost in gpu reset
 		 */
 		if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
 					       AMDGPU_GPU_PAGE_SIZE,
 					       AMDGPU_GEM_DOMAIN_VRAM,
 					       &bo, NULL))
-			DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp);
-
-		data->bps_bo[i] = bo;
+			DRM_NOTE("RAS NOTE: reserve vram for retired page 0x%llx fail\n", bp);
+		else
+			data->bps_bo[i] = bo;
[Guchun]The "else" should not needed? Otherwise, if amdgpu_bo_create_kernel_at always succeeds, we don't catch a chance to update bps_bo.
Is that true?
 		data->last_reserved = i + 1;
-		bo = NULL;
 	}
+}
+
+/* called in gpu recovery/init */
+int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) {
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data;
+	int ret = 0;
+
+	if (!con || !con->eh_data)
+		return 0;
+
+	mutex_lock(&con->recovery_lock);
+	data = con->eh_data;
+	if (!data)
+		goto out;
+
+	/* reserve vram at driver post stage. */
+	amdgpu_ras_create_bad_pages_bo(adev);
 
 	/* continue to save bad pages to eeprom even reesrve_vram fails */
 	ret = amdgpu_ras_save_bad_pages(adev); @@ -1583,9 +1595,6 @@ void amdgpu_ras_recovery_reset(struct amdgpu_device *adev)  {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
-	uint64_t bp;
-	struct amdgpu_bo *bo = NULL;
-	int i;
 
 	if (!con || !con->eh_data)
 		return;
@@ -1600,26 +1609,7 @@ void amdgpu_ras_recovery_reset(struct amdgpu_device *adev)
 	data = con->eh_data;
 	/* force to reserve all retired page again */
 	data->last_reserved = 0;
-
-	for (i = data->last_reserved; i < data->count; i++) {
-		bp = data->bps[i].retired_page;
-
-		/* There are three cases of reserve error should be ignored:
-		 * 1) a ras bad page has been allocated (used by someone);
-		 * 2) a ras bad page has been reserved (duplicate error injection
-		 *    for one page);
-		 * 3) bo info isn't lost in gpu reset
-		 */
-		if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
-					       AMDGPU_GPU_PAGE_SIZE,
-					       AMDGPU_GEM_DOMAIN_VRAM,
-					       &bo, NULL))
-			DRM_NOTE("RAS NOTE: recreate bo for retired page 0x%llx fail\n", bp);
-		else
-			data->bps_bo[i] = bo;
-		data->last_reserved = i + 1;
-		bo = NULL;
-	}
+	amdgpu_ras_create_bad_pages_bo(adev);
 }
 /* recovery end */
 
--
2.17.1



More information about the amd-gfx mailing list