[PATCH 2/2] drm/amdgpu: adjust the update of RAS bad page number
Tao Zhou
tao.zhou1 at amd.com
Fri Jul 11 09:06:04 UTC 2025
One eeprom record may not map to unit number of bad pages, the accurate
bad page number is gotten after bad page address check.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +++++++++++--------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 6 +--
3 files changed, 31 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1d6d4625abb3..ea3ab8c46115 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2568,6 +2568,9 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
}
for (; i < data->count; i++) {
+ if (!data->bps[i].ts)
+ continue;
+
(*bps)[i] = (struct ras_badpage){
.bp = data->bps[i].retired_page,
.size = AMDGPU_GPU_PAGE_SIZE,
@@ -2581,7 +2584,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
}
- *count = data->count;
+ *count = con->bad_page_num;
out:
mutex_unlock(&con->recovery_lock);
return ret;
@@ -2809,8 +2812,11 @@ static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
for (j = 0; j < count; j++) {
if (amdgpu_ras_check_bad_page_unlock(con,
- bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
+ data->count++;
+ data->space_left--;
continue;
+ }
if (!data->space_left &&
amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
@@ -2823,6 +2829,7 @@ static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
sizeof(struct eeprom_table_record));
data->count++;
data->space_left--;
+ con->bad_page_num++;
}
return 0;
@@ -2954,7 +2961,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
&bps[i], &err_data, nps);
if (ret)
- control->ras_num_bad_pages -= adev->umc.retire_unit;
+ con->bad_page_num -= adev->umc.retire_unit;
i += (adev->umc.retire_unit - 1);
} else {
break;
@@ -2968,8 +2975,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
ret = __amdgpu_ras_convert_rec_from_rom(adev,
&bps[i], &err_data, nps);
if (ret)
- control->ras_num_bad_pages -= adev->umc.retire_unit;
+ con->bad_page_num -= adev->umc.retire_unit;
}
+
+ con->eh_data->count_saved = con->eh_data->count;
} else {
ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
}
@@ -2992,7 +3001,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
struct amdgpu_ras_eeprom_control *control;
- int save_count, unit_num, bad_page_num, i;
+ int unit_num, i;
if (!con || !con->eh_data) {
if (new_cnt)
@@ -3013,27 +3022,25 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
- bad_page_num = control->ras_num_bad_pages;
- save_count = data->count - bad_page_num;
+ unit_num = data->count / adev->umc.retire_unit - control->ras_num_recs;
mutex_unlock(&con->recovery_lock);
- unit_num = save_count / adev->umc.retire_unit;
if (new_cnt)
*new_cnt = unit_num;
/* only new entries are saved */
- if (save_count > 0) {
+ if (unit_num > 0) {
/*old asics only save pa to eeprom like before*/
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
if (amdgpu_ras_eeprom_append(control,
- &data->bps[bad_page_num], save_count)) {
+ &data->bps[data->count_saved], unit_num)) {
dev_err(adev->dev, "Failed to save EEPROM table data!");
return -EIO;
}
} else {
for (i = 0; i < unit_num; i++) {
if (amdgpu_ras_eeprom_append(control,
- &data->bps[bad_page_num +
+ &data->bps[data->count_saved +
i * adev->umc.retire_unit], 1)) {
dev_err(adev->dev, "Failed to save EEPROM table data!");
return -EIO;
@@ -3041,7 +3048,9 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
}
}
- dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
+ dev_info(adev->dev, "Saved %d pages to EEPROM table.\n",
+ con->bad_page_num - control->ras_num_bad_pages);
+ data->count_saved = data->count;
}
return 0;
@@ -3096,17 +3105,17 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
}
}
+ ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
+ if (ret)
+ goto out;
+
ret = amdgpu_ras_eeprom_check(control);
if (ret)
goto out;
/* HW not usable */
- if (amdgpu_ras_is_rma(adev)) {
+ if (amdgpu_ras_is_rma(adev))
ret = -EHWPOISON;
- goto out;
- }
-
- ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
}
out:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 927d6bff734a..020245eb6aa0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -570,6 +570,8 @@ struct amdgpu_ras {
struct ras_event_manager *event_mgr;
uint64_t reserved_pages_in_bytes;
+
+ int bad_page_num;
};
struct ras_fs_data {
@@ -608,6 +610,7 @@ struct ras_err_handler_data {
struct eeprom_table_record *bps;
/* the count of entries */
int count;
+ int count_saved;
/* the space can place new entries */
int space_left;
};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 54838746f97d..91e20e317cdd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -743,8 +743,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
else
control->ras_num_mca_recs += num;
- control->ras_num_bad_pages = control->ras_num_pa_recs +
- control->ras_num_mca_recs * adev->umc.retire_unit;
+ control->ras_num_bad_pages = con->bad_page_num;
Out:
kfree(buf);
return res;
@@ -1457,8 +1456,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
if (!__get_eeprom_i2c_addr(adev, control))
return -EINVAL;
- control->ras_num_bad_pages = control->ras_num_pa_recs +
- control->ras_num_mca_recs * adev->umc.retire_unit;
+ control->ras_num_bad_pages = ras->bad_page_num;
if (hdr->header == RAS_TABLE_HDR_VAL) {
dev_dbg(adev->dev,
--
2.34.1
More information about the amd-gfx
mailing list