[PATCH] drm/amdgpu: support saving bad pages after gpu ras reset
From: YiPeng Chai <YiPeng.Chai at amd.com>
Date: Wed Oct 18 07:27:10 UTC 2023
Support saving bad pages after gpu ras reset for umc_v12_0.

During RAS recovery, allocate a buffer through the new ecc_data_save
callback before the error status is queried, so the queried error
addresses are kept across the GPU reset. Once recovery completes, the
ecc_data_restore callback retires the cached bad pages, saves them to
the bad page record table and frees the buffer.
Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
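For review, a simplified sketch of the call order this patch establishes
in amdgpu_ras_do_recovery() (hive setup and error handling elided; not
part of the diff):

    list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) {
            amdgpu_ras_data_save(remote_adev);           /* allocate umc.err_data.err_addr */
            amdgpu_ras_query_err_status(remote_adev);
            amdgpu_ras_log_on_err_counter(remote_adev);  /* query fills umc.err_data */
    }

    amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);

    list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head)
            amdgpu_ras_data_restore(remote_adev);        /* retire cached pages, free the buffer */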
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 40 +++++++++++++++++++++----
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 39 +++++++++++++++++--------
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 7 +++++
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 34 +++++++++++++++++++
5 files changed, 104 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4e4ba2149595..c20c9d6df149 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1026,7 +1026,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
return -EINVAL;
if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
- amdgpu_ras_get_ecc_info(adev, &err_data);
+ if (info->err_data) {
+ struct ras_err_data *ras_err = (struct ras_err_data *)info->err_data;
+
+ amdgpu_ras_get_ecc_info(adev, ras_err);
+ err_data.ce_count = ras_err->ce_count;
+ err_data.ue_count = ras_err->ue_count;
+ } else {
+ amdgpu_ras_get_ecc_info(adev, &err_data);
+ }
} else {
block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
if (!block_obj || !block_obj->hw_ops) {
@@ -1889,6 +1897,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
.head = obj->head,
+ .err_data = NULL,
};
/*
@@ -1906,10 +1915,13 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
* info table failed temporarily.
* should be removed until smu fix handle ecc_info table.
*/
- if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
- (amdgpu_ip_version(adev, MP1_HWIP, 0) ==
- IP_VERSION(13, 0, 2)))
- continue;
+ if (info.head.block == AMDGPU_RAS_BLOCK__UMC) {
+ if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2))
+ continue;
+
+ if (adev->umc.err_data.err_addr)
+ info.err_data = &adev->umc.err_data;
+ }
amdgpu_ras_query_error_status(adev, &info);
@@ -2020,6 +2032,18 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
return ret;
}
+static void amdgpu_ras_data_save(struct amdgpu_device *adev)
+{
+ if (adev->umc.ras && adev->umc.ras->ecc_data_save)
+ adev->umc.ras->ecc_data_save(adev);
+}
+
+static void amdgpu_ras_data_restore(struct amdgpu_device *adev)
+{
+ if (adev->umc.ras && adev->umc.ras->ecc_data_restore)
+ adev->umc.ras->ecc_data_restore(adev);
+}
+
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
struct amdgpu_ras *ras =
@@ -2042,6 +2066,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
list_for_each_entry(remote_adev,
device_list_handle, gmc.xgmi.head) {
+ amdgpu_ras_data_save(remote_adev);
amdgpu_ras_query_err_status(remote_adev);
amdgpu_ras_log_on_err_counter(remote_adev);
}
@@ -2080,6 +2105,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
}
+
+ if (device_list_handle)
+ list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head)
+ amdgpu_ras_data_restore(remote_adev);
+
atomic_set(&ras->in_recovery, 0);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 7999d202c9bc..9ee53910a2c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -513,6 +513,7 @@ struct ras_query_if {
struct ras_common_if head;
unsigned long ue_count;
unsigned long ce_count;
+ void *err_data;
};
struct ras_inject_if {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 24fcc9a2e422..7542606e10fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -76,6 +76,31 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
return ret;
}
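+/*
+ * Retire the reported bad pages: add them to the bad page list, save them
+ * to the bad page record table, and notify the SMU of the new totals.
+ */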
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+ struct ras_err_data *err_data)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!err_data || !err_data->err_addr || !err_data->err_addr_cnt)
+ return;
+
+ amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+ err_data->err_addr_cnt);
+
+ amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count));
+
+ amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
+
+ if (con->update_channel_flag) {
+ amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
+ con->update_channel_flag = false;
+ }
+}
+
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry,
@@ -144,18 +165,8 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
err_data->ue_count);
if ((amdgpu_bad_page_threshold != 0) &&
- err_data->err_addr_cnt) {
- amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
- err_data->err_addr_cnt);
- amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count));
-
- amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
-
- if (con->update_channel_flag == true) {
- amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
- con->update_channel_flag = false;
- }
- }
+ err_data->err_addr_cnt)
+ amdgpu_umc_handle_bad_pages(adev, err_data);
if (reset)
amdgpu_ras_reset_gpu(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 417a6726c71b..447d8785008c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -66,6 +66,9 @@ struct amdgpu_umc_ras {
void *ras_error_status);
/* support different eeprom table version for different asic */
void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header *hdr);
+
+ void (*ecc_data_save)(struct amdgpu_device *adev);
+ void (*ecc_data_restore)(struct amdgpu_device *adev);
};
struct amdgpu_umc_funcs {
@@ -93,6 +96,7 @@ struct amdgpu_umc {
const struct amdgpu_umc_funcs *funcs;
struct amdgpu_umc_ras *ras;
+ struct ras_err_data err_data;
/* active mask for umc node instance */
unsigned long active_mask;
@@ -118,4 +122,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
umc_func func, void *data);
+
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+ struct ras_err_data *err_data);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index c6742dd863d4..1fb78561f0fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -351,6 +351,38 @@ static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev)
return true;
}
+static void umc_v12_0_ecc_data_save(struct amdgpu_device *adev)
+{
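+ /* Cache buffer for error addresses queried during recovery;
+ * freed by umc_v12_0_ecc_data_restore(). If the allocation fails,
+ * err_addr stays NULL and the cached path is skipped.
+ */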
+ adev->umc.err_data.err_addr =
+ kcalloc(adev->umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+ adev->umc.err_data.ce_count = 0;
+ adev->umc.err_data.ue_count = 0;
+ adev->umc.err_data.err_addr_cnt = 0;
+}
+
+static void umc_v12_0_ecc_data_restore(struct amdgpu_device *adev)
+{
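+ /* Retire the bad pages cached across the reset, then drop the cache. */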
+ if (adev->umc.err_data.ue_count &&
+ adev->umc.err_data.err_addr_cnt &&
+ adev->umc.err_data.err_addr) {
+ amdgpu_umc_handle_bad_pages(adev, &adev->umc.err_data);
+ }
+
+ kfree(adev->umc.err_data.err_addr);
+
+ adev->umc.err_data.err_addr = NULL;
+ adev->umc.err_data.ce_count = 0;
+ adev->umc.err_data.ue_count = 0;
+ adev->umc.err_data.err_addr_cnt = 0;
+}
+
const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = {
.query_ras_error_count = umc_v12_0_query_ras_error_count,
.query_ras_error_address = umc_v12_0_query_ras_error_address,
@@ -362,4 +389,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
},
.err_cnt_init = umc_v12_0_err_cnt_init,
.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
+ .ecc_data_save = umc_v12_0_ecc_data_save,
+ .ecc_data_restore = umc_v12_0_ecc_data_restore,
};
--
2.34.1