[PATCH Review 1/1] drm/amdgpu: Reset vram error data info
Stanley.Yang
Stanley.Yang at amd.com
Wed Nov 1 10:34:49 UTC 2023
Reset the error data info stored in VRAM when the user clears the EEPROM table.
Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 97 ++++++++++++++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 4 +
3 files changed, 77 insertions(+), 26 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 753260745554..9c1072ea5760 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2336,6 +2336,77 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
return ret;
}
+/*
+ * Put a ras_err_data into its initial state: every field zeroed and
+ * err_node_list set up as an empty list. Always returns 0.
+ */
+int amdgpu_ras_error_data_init(struct ras_err_data *err_data)
+{
+	/* wipe the whole structure first; the list head is rebuilt below */
+	memset(err_data, 0, sizeof(*err_data));
+	INIT_LIST_HEAD(&err_data->err_node_list);
+
+	return 0;
+}
+
+/*
+ * Unlink one error node from the list it sits on and free it.
+ * A NULL err_node is accepted and ignored.
+ */
+static void amdgpu_ras_error_node_release(struct ras_err_node *err_node)
+{
+	if (err_node) {
+		/* unlink before freeing so the list never points at freed memory */
+		list_del(&err_node->node);
+		kvfree(err_node);
+	}
+}
+
+/*
+ * Release every error node linked on err_data->err_node_list.
+ * The other fields of err_data are left untouched.
+ */
+void amdgpu_ras_error_data_fini(struct ras_err_data *err_data)
+{
+	struct ras_err_node *cur, *next;
+
+	/* the _safe iterator is needed: each node is unlinked and freed in the loop */
+	list_for_each_entry_safe(cur, next, &err_data->err_node_list, node) {
+		amdgpu_ras_error_node_release(cur);
+	}
+}
+
+/*
+ * Bring the error data of one ras manager back to its initial state.
+ * A NULL obj is accepted and ignored.
+ */
+static void amdgpu_ras_reset_error_info(struct ras_manager *obj)
+{
+	if (obj) {
+		/* free the accumulated error nodes ... */
+		amdgpu_ras_error_data_fini(&obj->err_data);
+
+		/* ... then zero the record and rebuild its empty node list */
+		amdgpu_ras_error_data_init(&obj->err_data);
+	}
+}
+
+/*
+ * Reset the in-memory bad page records and the UMC ras manager error data.
+ *
+ * Under recovery_lock the used-record count is folded back into space_left
+ * and the record array is zeroed. Afterwards the error data of the first
+ * ras manager whose block is AMDGPU_RAS_BLOCK__UMC is reset.
+ *
+ * Returns 0 in every case, including when there is no RAS context or no
+ * eh_data (nothing to reset).
+ */
+int amdgpu_ras_reset_vram_bad_pages(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data;
+	struct ras_manager *obj;
+
+	if (!con || !con->eh_data)
+		return 0;
+
+	mutex_lock(&con->recovery_lock);
+
+	data = con->eh_data;
+	/*
+	 * All slots become free again; space_left is then used as the number
+	 * of records to clear (assumes count + space_left is the array
+	 * capacity -- TODO confirm against the allocation path).
+	 */
+	data->space_left += data->count;
+	data->count = 0;
+	/*
+	 * Size the clear by one record, sizeof(*data->bps). The previous
+	 * sizeof(data->bps) measured the bps member itself (a pointer), so
+	 * the array was under-cleared whenever a record is larger than that.
+	 * The NULL check covers the case where no array has been allocated
+	 * (presumably possible before the first bad page is recorded).
+	 */
+	if (data->bps)
+		memset(data->bps, 0, data->space_left * sizeof(*data->bps));
+
+	mutex_unlock(&con->recovery_lock);
+
+	/* only the UMC block's error data is reset; stop at the first match */
+	list_for_each_entry(obj, &con->head, node) {
+		if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) {
+			amdgpu_ras_reset_error_info(obj);
+			break;
+		}
+	}
+
+	return 0;
+}
+
/*
* write error record array to eeprom, the function should be
* protected by recovery_lock
@@ -3556,32 +3627,6 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
}
}
-int amdgpu_ras_error_data_init(struct ras_err_data *err_data)
-{
- memset(err_data, 0, sizeof(*err_data));
-
- INIT_LIST_HEAD(&err_data->err_node_list);
-
- return 0;
-}
-
-static void amdgpu_ras_error_node_release(struct ras_err_node *err_node)
-{
- if (!err_node)
- return;
-
- list_del(&err_node->node);
- kvfree(err_node);
-}
-
-void amdgpu_ras_error_data_fini(struct ras_err_data *err_data)
-{
- struct ras_err_node *err_node, *tmp;
-
- list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node)
- amdgpu_ras_error_node_release(err_node);
-}
-
static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 665414c22ca9..64710517b9fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -622,6 +622,8 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
unsigned long *new_cnt);
+int amdgpu_ras_reset_vram_bad_pages(struct amdgpu_device *adev);
+
static inline enum ta_ras_block
amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
switch (block) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 65aa218380be..40060f1b8ad6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -462,6 +462,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
mutex_unlock(&control->ras_tbl_mutex);
+ /* reset bad pages in vram structure */
+ if (amdgpu_ras_reset_vram_bad_pages(adev))
+ dev_warn(adev->dev, "reset vram bad pages structure failed, need reboot system\n");
+
return res;
}
--
2.25.1
More information about the amd-gfx
mailing list