[PATCH] drm/amdgpu: Use correct aca handle to validate aca bank
Xiang Liu
xiang.liu at amd.com
Tue Mar 18 09:15:13 UTC 2025
The aca handle is passed in by the upper-level caller, so it is inappropriate
to poll every aca handle to match and validate an aca bank; doing so causes
unexpected RAS error reports.
Signed-off-by: Xiang Liu <xiang.liu at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 122 ++++++++++--------------
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 2 +-
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 10 +-
3 files changed, 58 insertions(+), 76 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index ffd4c64e123c..b07e101c545d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -122,6 +122,25 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st
idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
}
+static bool aca_bank_should_dump(struct amdgpu_device *adev, enum aca_smu_type type)
+{
+ struct amdgpu_aca *aca = &adev->aca;
+ bool ret = true;
+
+ /*
+ * Because the UE Valid MCA count will only be cleared after reset,
+ * the aca bank is only dumped once during the gpu recovery stage.
+ */
+ if (type == ACA_SMU_TYPE_UE) {
+ if (amdgpu_ras_intr_triggered())
+ ret = atomic_cmpxchg(&aca->ue_dump_flag, 0, 1) == 0;
+ else
+ atomic_set(&aca->ue_dump_flag, 0);
+ }
+
+ return ret;
+}
+
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
int start, int count,
struct aca_banks *banks, struct ras_query_context *qctx)
@@ -130,6 +149,7 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
struct aca_bank bank;
int i, max_count, ret;
+ struct aca_bank_node *node;
if (!count)
return 0;
@@ -159,14 +179,16 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
return ret;
bank.smu_err_type = type;
-
- aca_smu_bank_dump(adev, i, count, &bank, qctx);
-
ret = aca_banks_add_bank(banks, &bank);
if (ret)
return ret;
}
+ i = 0;
+ /* dump each collected entry (node->bank), not the loop-local scratch 'bank' */
+ if (aca_bank_should_dump(adev, type))
+ list_for_each_entry(node, &banks->list, node)
+ aca_smu_bank_dump(adev, i++, count, &node->bank, qctx);
+
return 0;
}
@@ -318,72 +340,29 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank
return 0;
}
-static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank,
- enum aca_smu_type type, bank_handler_t handler, void *data)
-{
- struct aca_handle *handle;
- int ret;
-
- if (list_empty(&mgr->list))
- return 0;
-
- list_for_each_entry(handle, &mgr->list, node) {
- if (!aca_bank_is_valid(handle, bank, type))
- continue;
-
- ret = handler(handle, bank, type, data);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks,
+static int aca_dispatch_banks(struct aca_handle *handle, struct aca_banks *banks,
enum aca_smu_type type, bank_handler_t handler, void *data)
{
struct aca_bank_node *node;
struct aca_bank *bank;
- int ret;
- if (!mgr || !banks)
+ if (!handle || !banks)
return -EINVAL;
/* pre check to avoid unnecessary operations */
- if (list_empty(&mgr->list) || list_empty(&banks->list))
+ if (list_empty(&banks->list))
return 0;
list_for_each_entry(node, &banks->list, node) {
bank = &node->bank;
- ret = aca_dispatch_bank(mgr, bank, type, handler, data);
- if (ret)
- return ret;
+ if (aca_bank_is_valid(handle, bank, type))
+ handler(handle, bank, type, data);
}
return 0;
}
-static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type)
-{
- struct amdgpu_aca *aca = &adev->aca;
- bool ret = true;
-
- /*
- * Because the UE Valid MCA count will only be cleared after reset,
- * in order to avoid repeated counting of the error count,
- * the aca bank is only updated once during the gpu recovery stage.
- */
- if (type == ACA_SMU_TYPE_UE) {
- if (amdgpu_ras_intr_triggered())
- ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0;
- else
- atomic_set(&aca->ue_update_flag, 0);
- }
-
- return ret;
-}
-
static void aca_banks_generate_cper(struct amdgpu_device *adev,
enum aca_smu_type type,
struct aca_banks *banks,
@@ -417,20 +396,14 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
}
}
-static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
- bank_handler_t handler, struct ras_query_context *qctx, void *data)
+static int aca_banks_update(struct amdgpu_device *adev, struct aca_handle *handle,
+ enum aca_smu_type type, bank_handler_t handler,
+ struct ras_query_context *qctx, void *data)
{
- struct amdgpu_aca *aca = &adev->aca;
struct aca_banks banks;
u32 count = 0;
int ret;
- if (list_empty(&aca->mgr.list))
- return 0;
-
- if (!aca_bank_should_update(adev, type))
- return 0;
-
ret = aca_smu_get_valid_aca_count(adev, type, &count);
if (ret)
return ret;
@@ -442,15 +415,12 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx);
if (ret)
+ /* banks may already hold entries added before the failure; release them */
goto err_release_banks;
- if (list_empty(&banks.list)) {
- ret = 0;
- goto err_release_banks;
- }
+ if (list_empty(&banks.list))
+ return 0;
- ret = aca_dispatch_banks(&aca->mgr, &banks, type,
- handler, data);
+ ret = aca_dispatch_banks(handle, &banks, type, handler, data);
if (ret)
goto err_release_banks;
@@ -537,7 +507,7 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h
}
/* update aca bank to aca source error_cache first */
- ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);
+ ret = aca_banks_update(adev, handle, smu_type, handler_aca_log_bank_error, qctx, NULL);
if (ret)
return ret;
@@ -730,7 +700,7 @@ int amdgpu_aca_init(struct amdgpu_device *adev)
struct amdgpu_aca *aca = &adev->aca;
int ret;
- atomic_set(&aca->ue_update_flag, 0);
+ atomic_set(&aca->ue_dump_flag, 0);
ret = aca_manager_init(&aca->mgr);
if (ret)
@@ -745,14 +715,14 @@ void amdgpu_aca_fini(struct amdgpu_device *adev)
aca_manager_fini(&aca->mgr);
- atomic_set(&aca->ue_update_flag, 0);
+ atomic_set(&aca->ue_dump_flag, 0);
}
int amdgpu_aca_reset(struct amdgpu_device *adev)
{
struct amdgpu_aca *aca = &adev->aca;
- atomic_set(&aca->ue_update_flag, 0);
+ atomic_set(&aca->ue_dump_flag, 0);
return 0;
}
@@ -880,12 +850,20 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban
static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)
{
struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
+ struct aca_handle_manager *mgr = &adev->aca.mgr;
+ struct aca_handle *handle;
struct aca_dump_context context = {
.m = m,
.idx = 0,
};
- return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context);
+ if (list_empty(&mgr->list))
+ return 0;
+
+ list_for_each_entry(handle, &mgr->list, node)
+ aca_banks_update(adev, handle, type, handler_aca_bank_dump, NULL, (void *)&context);
+
+ return 0;
}
static int aca_dump_ce_show(struct seq_file *m, void *unused)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index 6f62e5d80ed6..e71d6f5afaec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -202,7 +202,7 @@ struct aca_smu_funcs {
struct amdgpu_aca {
struct aca_handle_manager mgr;
const struct aca_smu_funcs *smu_funcs;
- atomic_t ue_update_flag;
+ atomic_t ue_dump_flag;
bool is_enabled;
};
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index c0de682b7774..a4038e92c59e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -876,10 +876,14 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
void *data)
{
struct aca_bank_info info;
- u64 misc0;
+ u64 misc0, status;
u32 instlo;
int ret;
+ status = bank->regs[ACA_REG_IDX_STATUS];
+ if (!ACA_REG__STATUS__VAL(status))
+ return 0;
+
ret = aca_bank_info_decode(bank, &info);
if (ret)
return ret;
@@ -894,8 +898,8 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
switch (type) {
case ACA_SMU_TYPE_UE:
bank->aca_err_type = ACA_ERROR_TYPE_UE;
- ret = aca_error_cache_log_bank_error(handle, &info,
- ACA_ERROR_TYPE_UE, 1ULL);
+ if (ACA_REG__STATUS__UC(status) && ACA_REG__STATUS__PCC(status))
+ ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, 1);
break;
case ACA_SMU_TYPE_CE:
bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
--
2.34.1
More information about the amd-gfx
mailing list