[PATCH] drm/amdgpu: Skip poison aca bank from UE channel
Zhou1, Tao
Tao.Zhou1 at amd.com
Wed Jul 30 10:46:12 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
It would be better to add a comment for the added condition check. With this resolved, the patch is:
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Xiang Liu
> Sent: Wednesday, July 30, 2025 5:25 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Liu, Xiang(Dean)
> <Xiang.Liu at amd.com>
> Subject: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
>
> Avoid logging GFX poison consumption errors when a fatal error occurs.
>
> Signed-off-by: Xiang Liu <xiang.liu at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 47 ++++++++++++++-----------
> 1 file changed, 26 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> index 3835f2592914..59dbb9257096 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> @@ -125,6 +125,27 @@ static void aca_smu_bank_dump(struct amdgpu_device
> *adev, int idx, int total, st
> RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged
> by the scrubber\n"); }
>
> +static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum
> +aca_hwip_type type) {
> +
> + struct aca_hwip *hwip;
> + int hwid, mcatype;
> + u64 ipid;
> +
> + if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
> + return false;
> +
> + hwip = &aca_hwid_mcatypes[type];
> + if (!hwip->hwid)
> + return false;
> +
> + ipid = bank->regs[ACA_REG_IDX_IPID];
> + hwid = ACA_REG__IPID__HARDWAREID(ipid);
> + mcatype = ACA_REG__IPID__MCATYPE(ipid);
> +
> + return hwip->hwid == hwid && hwip->mcatype == mcatype; }
> +
> static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum
> aca_smu_type type,
> int start, int count,
> struct aca_banks *banks, struct
> ras_query_context *qctx) @@ -163,6 +184,11 @@ static int
> aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
>
> bank.smu_err_type = type;
>
> + if (type == ACA_SMU_TYPE_UE &&
> +
> ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) &&
> + !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC))
> + continue;
> +
> aca_smu_bank_dump(adev, i, count, &bank, qctx);
>
> ret = aca_banks_add_bank(banks, &bank); @@ -173,27 +199,6 @@
> static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum
> aca_smu_
> return 0;
> }
>
> -static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum
> aca_hwip_type type) -{
> -
> - struct aca_hwip *hwip;
> - int hwid, mcatype;
> - u64 ipid;
> -
> - if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
> - return false;
> -
> - hwip = &aca_hwid_mcatypes[type];
> - if (!hwip->hwid)
> - return false;
> -
> - ipid = bank->regs[ACA_REG_IDX_IPID];
> - hwid = ACA_REG__IPID__HARDWAREID(ipid);
> - mcatype = ACA_REG__IPID__MCATYPE(ipid);
> -
> - return hwip->hwid == hwid && hwip->mcatype == mcatype;
> -}
> -
> static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
> enum aca_smu_type type) {
> const struct aca_bank_ops *bank_ops = handle->bank_ops;
> --
> 2.34.1
More information about the amd-gfx mailing list