[PATCH Review 1/1] drm/amdgpu: Reset vram error data info

Stanley.Yang Stanley.Yang at amd.com
Wed Nov 1 10:34:49 UTC 2023


Reset the error data info stored in vram when the user clears the eeprom table.

Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 97 ++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  2 +
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    |  4 +
 3 files changed, 77 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 753260745554..9c1072ea5760 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2336,6 +2336,77 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	return ret;
 }
 
+int amdgpu_ras_error_data_init(struct ras_err_data *err_data)
+{
+	memset(err_data, 0, sizeof(*err_data));
+	/* the memset zeroed the list head as well; make it a valid empty list */
+	INIT_LIST_HEAD(&err_data->err_node_list);
+	/* no failure path here: the return value is always 0 */
+	return 0;
+}
+
+static void amdgpu_ras_error_node_release(struct ras_err_node *err_node)
+{
+	if (!err_node)
+		return;
+	/* unlink the node from its list first, then free the node itself */
+	list_del(&err_node->node);
+	kvfree(err_node);
+}
+
+void amdgpu_ras_error_data_fini(struct ras_err_data *err_data)
+{
+	struct ras_err_node *err_node, *tmp;
+
+	list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node)
+		amdgpu_ras_error_node_release(err_node); /* unlinks + frees err_node */
+}
+
+static void amdgpu_ras_reset_error_info(struct ras_manager *obj)
+{
+	struct ras_err_data *err_data;
+
+	if (!obj)
+		return;
+
+	err_data = &obj->err_data;
+
+	/* unlink and free every node on err_data->err_node_list */
+	amdgpu_ras_error_data_fini(err_data);
+
+	/* zero the whole err_data and re-create an empty node list */
+	amdgpu_ras_error_data_init(err_data);
+}
+
+/* reset vram bad pages data and umc ras manager error count */
+int amdgpu_ras_reset_vram_bad_pages(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data;
+	struct ras_manager *obj;
+
+	if (!con || !con->eh_data)
+		return 0;
+
+	mutex_lock(&con->recovery_lock);
+
+	data = con->eh_data;
+	data->space_left += data->count;
+	data->count = 0;
+	memset(data->bps, 0, data->space_left * sizeof(data->bps));
+
+	mutex_unlock(&con->recovery_lock);
+
+	list_for_each_entry(obj, &con->head, node) {
+		if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) {
+			amdgpu_ras_reset_error_info(obj);
+			break;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * write error record array to eeprom, the function should be
  * protected by recovery_lock
@@ -3556,32 +3627,6 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
 	}
 }
 
-int amdgpu_ras_error_data_init(struct ras_err_data *err_data)
-{
-	memset(err_data, 0, sizeof(*err_data));
-
-	INIT_LIST_HEAD(&err_data->err_node_list);
-
-	return 0;
-}
-
-static void amdgpu_ras_error_node_release(struct ras_err_node *err_node)
-{
-	if (!err_node)
-		return;
-
-	list_del(&err_node->node);
-	kvfree(err_node);
-}
-
-void amdgpu_ras_error_data_fini(struct ras_err_data *err_data)
-{
-	struct ras_err_node *err_node, *tmp;
-
-	list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node)
-		amdgpu_ras_error_node_release(err_node);
-}
-
 static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data,
 							     struct amdgpu_smuio_mcm_config_info *mcm_info)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 665414c22ca9..64710517b9fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -622,6 +622,8 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
 		unsigned long *new_cnt);
 
+int amdgpu_ras_reset_vram_bad_pages(struct amdgpu_device *adev);
+
 static inline enum ta_ras_block
 amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
 	switch (block) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 65aa218380be..40060f1b8ad6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -462,6 +462,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
 
 	mutex_unlock(&control->ras_tbl_mutex);
 
+	/* reset bad pages in vram structure */
+	if (amdgpu_ras_reset_vram_bad_pages(adev))
+		dev_warn(adev->dev, "reset vram bad pages structure failed, need reboot system\n");
+
 	return res;
 }
 
-- 
2.25.1



More information about the amd-gfx mailing list