[PATCH] drm/amdgpu: move convert_error_address out of umc_ras
Yang, Stanley
Stanley.Yang at amd.com
Fri Oct 14 06:35:43 UTC 2022
[AMD Official Use Only - General]
Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>
Regards,
Stanley
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
> Hawking Zhang
> Sent: Friday, October 14, 2022 2:19 PM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>;
> Yang, Stanley <Stanley.Yang at amd.com>
> Cc: Russell, Kent <Kent.Russell at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: move convert_error_address out of umc_ras
>
> RAS error address translation algorithm is common across dGPU and A + A
> platform as long as the SOC integrates the same generation of UMC IP.
>
> UMC RAS is managed by x86 MCA on the A + A platform, and umc_ras in the GPU
> driver is not initialized at all on the A + A platform. In such a case, any
> umc_ras callback implemented for the dGPU config shouldn't be invoked from an
> A + A specific callback.
>
> This change moves convert_error_address out of the dGPU umc_ras structure
> and shares it between the A + A and dGPU configs.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 +++++++++++----
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 ---
> drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 7 +++----
> drivers/gpu/drm/amd/amdgpu/umc_v6_7.h | 4 +++-
> 4 files changed, 17 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 75f1402101f4..ff92ea99d513 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -36,6 +36,7 @@
> #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
> #include "atom.h"
> #include "amdgpu_reset.h"
> +#include "umc_v6_7.h"
>
> #ifdef CONFIG_X86_MCE_AMD
> #include <asm/mce.h>
> @@ -2885,10 +2886,16 @@ static int amdgpu_bad_page_notifier(struct
> notifier_block *nb,
> /*
> * Translate UMC channel address to Physical address
> */
> - if (adev->umc.ras &&
> - adev->umc.ras->convert_ras_error_address)
> - adev->umc.ras->convert_ras_error_address(adev,
> - &err_data, m->addr, ch_inst, umc_inst);
> + switch (adev->ip_versions[UMC_HWIP][0]) {
> + case IP_VERSION(6, 7, 0):
> + umc_v6_7_convert_error_address(adev,
> + &err_data, m->addr, ch_inst, umc_inst);
> + break;
> + default:
> + dev_warn(adev->dev,
> + "UMC address to Physical address translation is not
> supported\n");
> + return NOTIFY_DONE;
> + }
>
> if (amdgpu_bad_page_threshold != 0) {
> amdgpu_ras_add_bad_pages(adev, err_data.err_addr, diff --
> git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> index e46439274f3a..3629d8f292ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> @@ -51,9 +51,6 @@ struct amdgpu_umc_ras {
> struct amdgpu_ras_block_object ras_block;
> void (*err_cnt_init)(struct amdgpu_device *adev);
> bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
> - void (*convert_ras_error_address)(struct amdgpu_device *adev,
> - struct ras_err_data *err_data, uint64_t
> err_addr,
> - uint32_t ch_inst, uint32_t umc_inst);
> void (*ecc_info_query_ras_error_count)(struct amdgpu_device
> *adev,
> void *ras_error_status);
> void (*ecc_info_query_ras_error_address)(struct amdgpu_device
> *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> index 5d5d031c9e7d..72fd963f178b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> @@ -187,9 +187,9 @@ static void
> umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
> }
> }
>
> -static void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
> - struct ras_err_data *err_data,
> uint64_t err_addr,
> - uint32_t ch_inst, uint32_t umc_inst)
> +void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
> + struct ras_err_data *err_data, uint64_t
> err_addr,
> + uint32_t ch_inst, uint32_t umc_inst)
> {
> uint32_t channel_index;
> uint64_t soc_pa, retired_page, column; @@ -553,5 +553,4 @@ struct
> amdgpu_umc_ras umc_v6_7_ras = {
> .query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
> .ecc_info_query_ras_error_count =
> umc_v6_7_ecc_info_query_ras_error_count,
> .ecc_info_query_ras_error_address =
> umc_v6_7_ecc_info_query_ras_error_address,
> - .convert_ras_error_address = umc_v6_7_convert_error_address,
> };
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> index fe41ed2f5945..105245d5b6e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> @@ -71,5 +71,7 @@ extern const uint32_t
>
> umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NU
> M][UMC_V6_7_CHANNEL_INSTANCE_NUM];
> extern const uint32_t
>
> umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM]
> [UMC_V6_7_CHANNEL_INSTANCE_NUM];
> -
> +void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
> + struct ras_err_data *err_data, uint64_t err_addr,
> + uint32_t ch_inst, uint32_t
> +umc_inst);
> #endif
> --
> 2.17.1
More information about the amd-gfx
mailing list