[PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
Tao Zhou
tao.zhou1 at amd.com
Tue Oct 18 08:31:57 UTC 2022
Define page retirement functions for MCA platform.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 112 ++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 2 +
drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 2 +-
drivers/gpu/drm/amd/amdgpu/umc_v6_7.h | 2 +
4 files changed, 117 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index aad3c8b4c810..e97b1bd343ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -22,6 +22,118 @@
*/
#include "amdgpu.h"
+#include "umc_v6_7.h"
+
+static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
+ struct ras_err_data *err_data, uint64_t err_addr,
+ uint32_t ch_inst, uint32_t umc_inst)
+{
+ switch (adev->ip_versions[UMC_HWIP][0]) {
+ case IP_VERSION(6, 7, 0):
+ umc_v6_7_convert_error_address(adev,
+ err_data, err_addr, ch_inst, umc_inst);
+ break;
+ default:
+ dev_warn(adev->dev,
+ "UMC address to Physical address translation is not supported\n");
+ return AMDGPU_RAS_FAIL;
+ }
+
+ return AMDGPU_RAS_SUCCESS;
+}
+
+static int amdgpu_umc_ecc_info_query_error_address(struct amdgpu_device *adev,
+ struct ras_err_data *err_data)
+{
+ switch (adev->ip_versions[UMC_HWIP][0]) {
+ case IP_VERSION(6, 7, 0):
+ umc_v6_7_ecc_info_query_ras_error_address(adev,
+ err_data);
+ break;
+ default:
+ dev_warn(adev->dev,
+ "UMC error address query is not supported\n");
+ return AMDGPU_RAS_FAIL;
+ }
+
+ return AMDGPU_RAS_SUCCESS;
+}
+
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+ uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
+{
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+ int ret = AMDGPU_RAS_FAIL;
+
+ err_data.err_addr =
+ kcalloc(adev->umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+ if (!err_data.err_addr) {
+ dev_warn(adev->dev,
+ "Failed to alloc memory for umc error record in MCA notifier!\n");
+ return AMDGPU_RAS_FAIL;
+ }
+
+ /*
+ * Translate UMC channel address to Physical address
+ */
+ ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
+ ch_inst, umc_inst);
+ if (ret)
+ goto out;
+
+ if (amdgpu_bad_page_threshold != 0) {
+ amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+ err_data.err_addr_cnt);
+ amdgpu_ras_save_bad_pages(adev);
+ }
+
+out:
+ kfree(err_data.err_addr);
+ return ret;
+}
+
+static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
+ struct ras_err_data *err_data, bool reset)
+{
+ int ret = AMDGPU_RAS_FAIL;
+
+ err_data->err_addr =
+ kcalloc(adev->umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+ if (!err_data->err_addr) {
+ dev_warn(adev->dev,
+ "Failed to alloc memory for MCA RAS poison handler!\n");
+ goto out2;
+ }
+
+ /*
+ * Translate UMC channel address to Physical address
+ */
+ ret = amdgpu_umc_ecc_info_query_error_address(adev, err_data);
+ if (ret)
+ goto out1;
+
+ if (amdgpu_bad_page_threshold != 0) {
+ amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+ err_data->err_addr_cnt);
+ amdgpu_ras_save_bad_pages(adev);
+ }
+
+out1:
+ kfree(err_data->err_addr);
+out2:
+ /* trigger gpu reset even error count is 0 for CPU MCA RAS,
+ * MCA notifier is responsible for page retirement if error
+ * count can't be queried in poison handler.
+ */
+ if (reset) {
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+ amdgpu_ras_reset_gpu(adev);
+ }
+
+ return ret;
+}
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 3629d8f292ef..659a10de29c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry);
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+ uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index 72fd963f178b..b0da50d03af6 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -252,7 +252,7 @@ static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
}
}
-static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
+void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
index 105245d5b6e5..24382e9e5814 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
@@ -74,4 +74,6 @@ extern const uint32_t
void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
uint32_t ch_inst, uint32_t umc_inst);
+void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
+ void *ras_error_status);
#endif
--
2.35.1
More information about the amd-gfx
mailing list