[PATCH] drm/amdgpu: move convert_error_address out of umc_ras

Yang, Stanley Stanley.Yang at amd.com
Fri Oct 14 06:35:43 UTC 2022


[AMD Official Use Only - General]

Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>

Regards,
Stanley
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
> Hawking Zhang
> Sent: Friday, October 14, 2022 2:19 PM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>;
> Yang, Stanley <Stanley.Yang at amd.com>
> Cc: Russell, Kent <Kent.Russell at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: move convert_error_address out of umc_ras
> 
> RAS error address translation algorithm is common across dGPU and A + A
> platform as long as the SOC integrates the same generation of UMC IP.
> 
> UMC RAS is managed by x86 MCA on A + A platform, so umc_ras in the GPU
> driver is not initialized at all on A + A platform. In such a case, any
> umc_ras callback implemented for dGPU config shouldn't be invoked from
> the A + A specific callback.
> 
> The change moves convert_error_address out of the dGPU umc_ras structure
> and makes it shared between A + A and dGPU config.
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 +++++++++++----
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  3 ---
>  drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   |  7 +++----
>  drivers/gpu/drm/amd/amdgpu/umc_v6_7.h   |  4 +++-
>  4 files changed, 17 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 75f1402101f4..ff92ea99d513 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -36,6 +36,7 @@
>  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
>  #include "atom.h"
>  #include "amdgpu_reset.h"
> +#include "umc_v6_7.h"
> 
>  #ifdef CONFIG_X86_MCE_AMD
>  #include <asm/mce.h>
> @@ -2885,10 +2886,16 @@ static int amdgpu_bad_page_notifier(struct
> notifier_block *nb,
>  	/*
>  	 * Translate UMC channel address to Physical address
>  	 */
> -	if (adev->umc.ras &&
> -	    adev->umc.ras->convert_ras_error_address)
> -		adev->umc.ras->convert_ras_error_address(adev,
> -			&err_data, m->addr, ch_inst, umc_inst);
> +	switch (adev->ip_versions[UMC_HWIP][0]) {
> +	case IP_VERSION(6, 7, 0):
> +		umc_v6_7_convert_error_address(adev,
> +				&err_data, m->addr, ch_inst, umc_inst);
> +		break;
> +	default:
> +		dev_warn(adev->dev,
> +			 "UMC address to Physical address translation is not
> supported\n");
> +		return NOTIFY_DONE;
> +	}
> 
>  	if (amdgpu_bad_page_threshold != 0) {
>  		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> index e46439274f3a..3629d8f292ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> @@ -51,9 +51,6 @@ struct amdgpu_umc_ras {
>  	struct amdgpu_ras_block_object ras_block;
>  	void (*err_cnt_init)(struct amdgpu_device *adev);
>  	bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
> -	void (*convert_ras_error_address)(struct amdgpu_device *adev,
> -				struct ras_err_data *err_data, uint64_t
> err_addr,
> -				uint32_t ch_inst, uint32_t umc_inst);
>  	void (*ecc_info_query_ras_error_count)(struct amdgpu_device
> *adev,
>  				      void *ras_error_status);
>  	void (*ecc_info_query_ras_error_address)(struct amdgpu_device
> *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> index 5d5d031c9e7d..72fd963f178b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> @@ -187,9 +187,9 @@ static void
> umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
>  	}
>  }
> 
> -static void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
> -					struct ras_err_data *err_data,
> uint64_t err_addr,
> -					uint32_t ch_inst, uint32_t umc_inst)
> +void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
> +				    struct ras_err_data *err_data, uint64_t
> err_addr,
> +				    uint32_t ch_inst, uint32_t umc_inst)
>  {
>  	uint32_t channel_index;
>  	uint64_t soc_pa, retired_page, column;
> @@ -553,5 +553,4 @@ struct amdgpu_umc_ras umc_v6_7_ras = {
>  	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
>  	.ecc_info_query_ras_error_count =
> umc_v6_7_ecc_info_query_ras_error_count,
>  	.ecc_info_query_ras_error_address =
> umc_v6_7_ecc_info_query_ras_error_address,
> -	.convert_ras_error_address = umc_v6_7_convert_error_address,
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> index fe41ed2f5945..105245d5b6e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> @@ -71,5 +71,7 @@ extern const uint32_t
> 
> 	umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NU
> M][UMC_V6_7_CHANNEL_INSTANCE_NUM];
>  extern const uint32_t
> 
> 	umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM]
> [UMC_V6_7_CHANNEL_INSTANCE_NUM];
> -
> +void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
> +                                    struct ras_err_data *err_data, uint64_t err_addr,
> +                                    uint32_t ch_inst, uint32_t
> +umc_inst);
>  #endif
> --
> 2.17.1


More information about the amd-gfx mailing list