[PATCH 3/4] drm/amdgpu: save umc error records
Tao Zhou
tao.zhou1 at amd.com
Fri Aug 30 12:24:52 UTC 2019
save umc error records to ras bad page array
v2: add bad pages before gpu reset
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +-
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 29 ++++++++++++++----
drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 39 ++++++++++++++++++++-----
3 files changed, 57 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b6bac873c588..42e1d379e21c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -347,7 +347,7 @@ struct ras_err_data {
unsigned long ue_count;
unsigned long ce_count;
unsigned long err_addr_cnt;
- uint64_t *err_addr;
+ struct eeprom_table_record *err_addr;
};
struct ras_err_handler_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 70a05e302d60..5015a07dcb3c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -246,16 +246,35 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (adev->umc.funcs->query_ras_error_count)
adev->umc.funcs->query_ras_error_count(adev, err_data);
- /* umc query_ras_error_address is also responsible for clearing
- * error status
- */
- if (adev->umc.funcs->query_ras_error_address)
+
+ if (adev->umc.funcs->query_ras_error_address &&
+ adev->umc.max_ras_err_cnt_per_query) {
+ err_data->err_addr =
+ kcalloc(adev->umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+ /* still call query_ras_error_address to clear error status
+ * even NOMEM error is encountered
+ */
+ if(!err_data->err_addr)
+ DRM_WARN("Failed to alloc memory for umc error address record!\n");
+
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
adev->umc.funcs->query_ras_error_address(adev, err_data);
+ }
/* only uncorrectable error needs gpu reset */
- if (err_data->ue_count)
+ if (err_data->ue_count) {
+ if (err_data->err_addr_cnt &&
+ amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+ err_data->err_addr_cnt))
+ DRM_WARN("Failed to add ras bad page!\n");
+
amdgpu_ras_reset_gpu(adev, 0);
+ }
+ kfree(err_data->err_addr);
return AMDGPU_RAS_SUCCESS;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
index 8502e736f721..a54ff170591f 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
@@ -75,6 +75,17 @@ static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
RSMU_UMC_INDEX_MODE_EN, 0);
}
+static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev)
+{
+ uint32_t rsmu_umc_index;
+
+ rsmu_umc_index = RREG32_SOC15(RSMU, 0,
+ mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);
+ return REG_GET_FIELD(rsmu_umc_index,
+ RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
+ RSMU_UMC_INDEX_INSTANCE);
+}
+
static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
uint32_t umc_reg_offset,
unsigned long *error_count)
@@ -165,7 +176,8 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
uint32_t umc_reg_offset, uint32_t channel_index)
{
uint32_t lsb, mc_umc_status_addr;
- uint64_t mc_umc_status, err_addr;
+ uint64_t mc_umc_status, err_addr, retired_page;
+ struct eeprom_table_record *err_rec;
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -177,6 +189,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
return;
}
+ err_rec = &err_data->err_addr[err_data->err_addr_cnt];
mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset);
/* calculate error address if ue/ce error is detected */
@@ -191,12 +204,24 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
err_addr &= ~((0x1ULL << lsb) - 1);
/* translate umc channel address to soc pa, 3 parts are included */
- err_data->err_addr[err_data->err_addr_cnt] =
- ADDR_OF_8KB_BLOCK(err_addr) |
- ADDR_OF_256B_BLOCK(channel_index) |
- OFFSET_IN_256B_BLOCK(err_addr);
-
- err_data->err_addr_cnt++;
+ retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
+ ADDR_OF_256B_BLOCK(channel_index) |
+ OFFSET_IN_256B_BLOCK(err_addr);
+
+ /* we only save ue error information currently, ce is skipped */
+ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
+ == 1) {
+ err_rec->address = err_addr;
+ /* page frame address is saved */
+ err_rec->retired_page = retired_page >> PAGE_SHIFT;
+ err_rec->ts = (uint64_t)ktime_get_real_seconds();
+ err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+ err_rec->cu = 0;
+ err_rec->mem_channel = channel_index;
+ err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev);
+
+ err_data->err_addr_cnt++;
+ }
}
/* clear umc status */
--
2.17.1
More information about the amd-gfx
mailing list