[PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo create
Chen, Guchun
Guchun.Chen at amd.com
Mon Sep 30 03:26:06 UTC 2019
Regards,
Guchun
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Monday, September 30, 2019 11:20 AM
To: amd-gfx at lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Chen, Guchun <Guchun.Chen at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo create
implement ras_create_bad_pages_bo to simplify ras code
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 72 +++++++++++--------------
1 file changed, 31 insertions(+), 41 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d1bafa92ca91..fe3a57e567c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1430,41 +1430,53 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
return ret;
}
-/* called in gpu recovery/init */
-int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
+static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)
{
+ /* Note: the caller should guarantee con and data are not NULL */
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct ras_err_handler_data *data;
+ struct ras_err_handler_data *data = con->eh_data;
uint64_t bp;
- struct amdgpu_bo *bo = NULL;
- int i, ret = 0;
-
- if (!con || !con->eh_data)
- return 0;
+ struct amdgpu_bo *bo;
+ int i;
- mutex_lock(&con->recovery_lock);
- data = con->eh_data;
- if (!data)
- goto out;
- /* reserve vram at driver post stage. */
for (i = data->last_reserved; i < data->count; i++) {
+ bo = NULL;
bp = data->bps[i].retired_page;
- /* There are two cases of reserve error should be ignored:
+ /* There are three cases of reserve error should be ignored:
* 1) a ras bad page has been allocated (used by someone);
* 2) a ras bad page has been reserved (duplicate error injection
* for one page);
+ * 3) bo info isn't lost in gpu reset
*/
if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
AMDGPU_GPU_PAGE_SIZE,
AMDGPU_GEM_DOMAIN_VRAM,
&bo, NULL))
- DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp);
-
- data->bps_bo[i] = bo;
+ DRM_NOTE("RAS NOTE: reserve vram for retired page 0x%llx fail\n", bp);
+ else
+ data->bps_bo[i] = bo;
[Guchun]The "else" should not needed? Otherwise, if amdgpu_bo_create_kernel_at always succeeds, we don't catch a chance to update bps_bo.
Is that true?
data->last_reserved = i + 1;
- bo = NULL;
}
+}
+
+/* called in gpu recovery/init */
+int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) {
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_err_handler_data *data;
+ int ret = 0;
+
+ if (!con || !con->eh_data)
+ return 0;
+
+ mutex_lock(&con->recovery_lock);
+ data = con->eh_data;
+ if (!data)
+ goto out;
+
+ /* reserve vram at driver post stage. */
+ amdgpu_ras_create_bad_pages_bo(adev);
/* continue to save bad pages to eeprom even reesrve_vram fails */
ret = amdgpu_ras_save_bad_pages(adev); @@ -1583,9 +1595,6 @@ void amdgpu_ras_recovery_reset(struct amdgpu_device *adev) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
- uint64_t bp;
- struct amdgpu_bo *bo = NULL;
- int i;
if (!con || !con->eh_data)
return;
@@ -1600,26 +1609,7 @@ void amdgpu_ras_recovery_reset(struct amdgpu_device *adev)
data = con->eh_data;
/* force to reserve all retired page again */
data->last_reserved = 0;
-
- for (i = data->last_reserved; i < data->count; i++) {
- bp = data->bps[i].retired_page;
-
- /* There are three cases of reserve error should be ignored:
- * 1) a ras bad page has been allocated (used by someone);
- * 2) a ras bad page has been reserved (duplicate error injection
- * for one page);
- * 3) bo info isn't lost in gpu reset
- */
- if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
- AMDGPU_GPU_PAGE_SIZE,
- AMDGPU_GEM_DOMAIN_VRAM,
- &bo, NULL))
- DRM_NOTE("RAS NOTE: recreate bo for retired page 0x%llx fail\n", bp);
- else
- data->bps_bo[i] = bo;
- data->last_reserved = i + 1;
- bo = NULL;
- }
+ amdgpu_ras_create_bad_pages_bo(adev);
}
/* recovery end */
--
2.17.1
More information about the amd-gfx
mailing list