[PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure
Chen, Guchun
Guchun.Chen at amd.com
Mon Sep 2 02:13:06 UTC 2019
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Friday, August 30, 2019 8:25 PM
To: amd-gfx at lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Chen, Guchun <Guchun.Chen at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure
change bps type from retired page to eeprom table record, prepare for saving error records to eeprom
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++--------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--
2 files changed, 43 insertions(+), 27 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2ca3997d4b3a..24663ec41248 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1187,14 +1187,14 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
for (; i < data->count; i++) {
(*bps)[i] = (struct ras_badpage){
- .bp = data->bps[i].bp,
+ .bp = data->bps[i].retired_page,
.size = AMDGPU_GPU_PAGE_SIZE,
.flags = 0,
};
if (data->last_reserved <= i)
(*bps)[i].flags = 1;
- else if (data->bps[i].bo == NULL)
+ else if (data->bps_bo[i] == NULL)
(*bps)[i].flags = 2;
}
@@ -1288,30 +1288,40 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, {
unsigned int old_space = data->count + data->space_left;
unsigned int new_space = old_space + pages;
- unsigned int align_space = ALIGN(new_space, 1024);
- void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-
- if (!tmp)
+ unsigned int align_space = ALIGN(new_space, 512);
[Guchun]Any special reason to change alignment from 512 to 1024?
+ void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
+ struct amdgpu_bo **bps_bo =
+ kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
+
+ if (!bps || !bps_bo) {
+ kfree(bps);
+ kfree(bps_bo);
return -ENOMEM;
+ }
if (data->bps) {
- memcpy(tmp, data->bps,
+ memcpy(bps, data->bps,
data->count * sizeof(*data->bps));
kfree(data->bps);
}
+ if (data->bps_bo) {
+ memcpy(bps_bo, data->bps_bo,
+ data->count * sizeof(*data->bps_bo));
+ kfree(data->bps_bo);
+ }
- data->bps = tmp;
+ data->bps = bps;
+ data->bps_bo = bps_bo;
data->space_left += align_space - old_space;
return 0;
}
/* it deal with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
- unsigned long *bps, int pages)
+ struct eeprom_table_record *bps, int pages)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
- int i = pages;
int ret = 0;
if (!con || !con->eh_data || !bps || pages <= 0) @@ -1328,10 +1338,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
goto out;
}
- while (i--)
- data->bps[data->count++].bp = bps[i];
-
+ memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
+ data->count += pages;
data->space_left -= pages;
+
out:
mutex_unlock(&con->recovery_lock);
@@ -1356,13 +1366,13 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
goto out;
/* reserve vram at driver post stage. */
for (i = data->last_reserved; i < data->count; i++) {
- bp = data->bps[i].bp;
+ bp = data->bps[i].retired_page;
if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
PAGE_SIZE, &bo))
DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
- data->bps[i].bo = bo;
+ data->bps_bo[i] = bo;
data->last_reserved = i + 1;
}
out:
@@ -1387,11 +1397,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
goto out;
for (i = data->last_reserved - 1; i >= 0; i--) {
- bo = data->bps[i].bo;
+ bo = data->bps_bo[i];
amdgpu_ras_release_vram(adev, &bo);
- data->bps[i].bo = bo;
+ data->bps_bo[i] = bo;
data->last_reserved = i;
}
out:
@@ -1407,12 +1417,19 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
return 0;
}
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) {
- /* TODO
- * read the array to eeprom when SMU disabled.
- */
- return 0;
+ struct eeprom_table_record *bps = NULL;
+ int ret;
+
+ ret = amdgpu_ras_add_bad_pages(adev, bps,
+ adev->umc.max_ras_err_cnt_per_query);
+
+ return ret;
}
static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 66b71525446e..b6bac873c588 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -351,11 +351,10 @@ struct ras_err_data { };
struct ras_err_handler_data {
- /* point to bad pages array */
- struct {
- unsigned long bp;
- struct amdgpu_bo *bo;
- } *bps;
+ /* point to bad page records array */
+ struct eeprom_table_record *bps;
+ /* point to reserved bo array */
+ struct amdgpu_bo **bps_bo;
/* the count of entries */
int count;
/* the space can place new entries */
@@ -492,7 +491,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
/* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
- unsigned long *bps, int pages);
+ struct eeprom_table_record *bps, int pages);
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
--
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx at lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list