[PATCH] drm/amdgpu: add ACA error query support for umc_v12_0
Wang, Yang(Kevin)
KevinYang.Wang at amd.com
Fri Apr 26 07:51:46 UTC 2024
[AMD Official Use Only - General]
Please ignore this patch; Thomas will submit a new patch to replace it.
Best Regards,
Kevin
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Friday, April 26, 2024 11:15 AM
To: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
Subject: RE: [PATCH] drm/amdgpu: add ACA error query support for umc_v12_0
[AMD Official Use Only - General]
> -----Original Message-----
> From: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
> Sent: Wednesday, April 17, 2024 11:10 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
> Subject: [PATCH] drm/amdgpu: add ACA error query support for umc_v12_0
>
> add ACA error query support for umc_v12_0.
>
> Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 +++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 ++++
> drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 18 ++++++++++++++----
> 3 files changed, 21 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 352ce16a0963..46b7f0c5cd8a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1268,9 +1268,9 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device
> *adev, enum amdgpu_ras_block blk)
> return 0;
> }
>
> -static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev,
> enum amdgpu_ras_block blk,
> - enum aca_error_type type, struct
> ras_err_data *err_data,
> - struct ras_query_context *qctx)
> +int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk,
> + enum aca_error_type type, struct
> + ras_err_data
> *err_data,
> + struct ras_query_context *qctx)
> {
> struct ras_manager *obj;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 8d26989c75c8..487548879c49 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -898,6 +898,10 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device
> *adev, enum amdgpu_ras_block blk) ssize_t
> amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
> struct aca_handle *handle, char *buf,
> void *data);
>
> +int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk,
> + enum aca_error_type type, struct
> + ras_err_data
> *err_data,
> + struct ras_query_context *qctx);
[Tao] Is this function used in this patch?
> +
> void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info,
> struct ras_err_addr *err_addr);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index f69871902233..9f2c46814a4f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -317,16 +317,26 @@ static int
> umc_v12_0_err_cnt_init_per_channel(struct
> amdgpu_device *adev, static void
> umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
> void *ras_error_status) {
> + struct ras_err_data *err_data = (struct ras_err_data
> +*)ras_error_status;
> struct ras_query_context qctx;
>
> memset(&qctx, 0, sizeof(qctx));
> qctx.event_id = amdgpu_ras_acquire_event_id(adev,
> amdgpu_ras_intr_triggered() ?
> RAS_EVENT_TYPE_ISR :
> RAS_EVENT_TYPE_INVALID);
>
> - amdgpu_mca_smu_log_ras_error(adev,
> - AMDGPU_RAS_BLOCK__UMC,
> AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status, &qctx);
> - amdgpu_mca_smu_log_ras_error(adev,
> - AMDGPU_RAS_BLOCK__UMC,
> AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status, &qctx);
> + if (amdgpu_aca_is_enabled(adev)) {
> + amdgpu_aca_get_error_data(adev,
> AMDGPU_RAS_BLOCK__UMC, ACA_ERROR_TYPE_CE,
> + err_data, &qctx);
> + amdgpu_aca_get_error_data(adev,
> AMDGPU_RAS_BLOCK__UMC, ACA_ERROR_TYPE_UE,
> + err_data, &qctx);
> + amdgpu_aca_get_error_data(adev,
> AMDGPU_RAS_BLOCK__UMC, ACA_ERROR_TYPE_DEFERRED,
> + err_data, &qctx);
> + } else {
> + amdgpu_mca_smu_log_ras_error(adev,
> AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE,
> + err_data, &qctx);
> + amdgpu_mca_smu_log_ras_error(adev,
> AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE,
> + err_data, &qctx);
> + }
> }
>
> static void umc_v12_0_ecc_info_query_ras_error_address(struct
> amdgpu_device *adev,
> --
> 2.34.1
More information about the amd-gfx mailing list