[PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
YiPeng Chai
YiPeng.Chai at amd.com
Thu Apr 18 02:58:31 UTC 2024
Retire bad pages for umc v12_0.
Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57 +++++++++++++++++++++++++-
1 file changed, 55 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6c2b61ef5b57..bd917eb6ea24 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -28,6 +28,8 @@
#include "umc/umc_12_0_0_sh_mask.h"
#include "mp/mp_13_0_6_sh_mask.h"
+#define MAX_ECC_NUM_PER_RETIREMENT 16
+
static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
uint32_t node_inst,
uint32_t umc_inst,
@@ -633,6 +635,58 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
return 0;
}
+static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
+ struct ras_ecc_err *ecc_err, void *ras_error_status)
+{
+ struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+ uint32_t i = 0;
+ int ret = 0;
+
+ if (!err_data || !ecc_err)
+ return -EINVAL;
+
+ for (i = 0; i < ecc_err->err_pages.count; i++) {
+ ret = amdgpu_umc_fill_error_record(err_data,
+ ecc_err->addr,
+ ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
+ MCA_IPID_2_UMC_CH(ecc_err->ipid),
+ MCA_IPID_2_UMC_INST(ecc_err->ipid));
+ if (ret)
+ break;
+ }
+
+ err_data->de_count++;
+
+ return ret;
+}
+
+static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
+ void *ras_error_status)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
+ struct radix_tree_root *ecc_tree;
+ int new_detected, ret, i;
+
+ ecc_tree = &con->umc_ecc_log.de_page_tree;
+
+ mutex_lock(&con->umc_ecc_log.lock);
+ new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
+ 0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
+ for (i = 0; i < new_detected; i++) {
+ if (!entries[i])
+ continue;
+
+ ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
+ if (ret) {
+ dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
+ break;
+ }
+ radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
+ }
+ mutex_unlock(&con->umc_ecc_log.lock);
+}
+
struct amdgpu_umc_ras umc_v12_0_ras = {
.ras_block = {
.hw_ops = &umc_v12_0_ras_hw_ops,
@@ -640,8 +694,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
},
.err_cnt_init = umc_v12_0_err_cnt_init,
.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
- .ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
- .ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
+ .ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
.update_ecc_status = umc_v12_0_update_ecc_status,
};
--
2.34.1
More information about the amd-gfx
mailing list