[PATCH 09/11] drm/amdgpu: Rework xgmi_wafl_pcs ras sw_init

Yang, Stanley Stanley.Yang at amd.com
Mon Mar 6 07:28:20 UTC 2023



> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Monday, March 6, 2023 10:32 AM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>;
> Yang, Stanley <Stanley.Yang at amd.com>; Li, Candice <Candice.Li at amd.com>;
> Chai, Thomas <YiPeng.Chai at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH 09/11] drm/amdgpu: Rework xgmi_wafl_pcs ras sw_init
> 
> To align with other IP blocks.
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  9 +++++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28
> +++++++++++++++++++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  7 ++++++
>  4 files changed, 37 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 524e2c9b3012..d4685d22be60 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -500,9 +500,12 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device
> *adev)
> 
>  	/* xgmi ras block */
>  	if (amdgpu_ras_is_supported(adev,
> AMDGPU_RAS_BLOCK__XGMI_WAFL)) {
> -		adev->gmc.xgmi.ras = &xgmi_ras;
> -		amdgpu_ras_register_ras_block(adev, &adev-
> >gmc.xgmi.ras->ras_block);
> -		adev->gmc.xgmi.ras_if = &adev->gmc.xgmi.ras-
> >ras_block.ras_comm;
> +		r = amdgpu_xgmi_ras_sw_init(adev);
> +		if (r) {
> +			dev_err(adev->dev, "Failed to initialize
> xgmi_wafl_pcs ras block!\n");
> +			return r;
> +		}
> +
>  	}
> 
>  	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index fef1575cd0cf..3fe24348d199 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -1048,12 +1048,30 @@ struct amdgpu_ras_block_hw_ops
> xgmi_ras_hw_ops = {
> 
>  struct amdgpu_xgmi_ras xgmi_ras = {
>  	.ras_block = {
> -		.ras_comm = {
> -			.name = "xgmi_wafl",
> -			.block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
> -			.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
> -		},
>  		.hw_ops = &xgmi_ras_hw_ops,
>  		.ras_late_init = amdgpu_xgmi_ras_late_init,
>  	},
>  };
> +
> +int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev) {
> +	int err;
> +	struct amdgpu_xgmi_ras *ras;
> +
> +	if (!adev->gmc.xgmi.ras)
> +		return 0;
> +
> +	ras = adev->gmc.xgmi.ras;
> +	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
> +	if (err) {
> +		dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras
> block!\n");
> +		return err;
> +	}
> +
> +	strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl_pcs");
> +	ras->ras_block.ras_comm.block =
> AMDGPU_RAS_BLOCK__XGMI_WAFL;
> +	ras->ras_block.ras_comm.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> +	adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm;
> +
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 30dcc1681b4e..86fbf56938f4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -73,5 +73,6 @@ static inline bool amdgpu_xgmi_same_hive(struct
> amdgpu_device *adev,
>  		adev->gmc.xgmi.hive_id &&
>  		adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);  }
> +int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev);
> 
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 67c2a5186b8a..2a8dc9b52c2d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1381,6 +1381,12 @@ static void gmc_v9_0_set_mca_ras_funcs(struct
> amdgpu_device *adev)
>  	}
>  }
> 
> +static void gmc_v9_0_set_xgmi_ras_funcs(struct amdgpu_device *adev) {
> +	if (!adev->gmc.xgmi.connected_to_cpu)

[Stanley]: Can we use if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) instead of if (!adev->gmc.xgmi.connected_to_cpu)
	    here, to keep the per-IP RAS support check uniform?

Regards,
Stanley
> +		adev->gmc.xgmi.ras = &xgmi_ras;
> +}
> +
>  static int gmc_v9_0_early_init(void *handle)  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -1404,6 +1410,7 @@ static int gmc_v9_0_early_init(void *handle)
>  	gmc_v9_0_set_gfxhub_funcs(adev);
>  	gmc_v9_0_set_hdp_ras_funcs(adev);
>  	gmc_v9_0_set_mca_ras_funcs(adev);
> +	gmc_v9_0_set_xgmi_ras_funcs(adev);
> 
>  	adev->gmc.shared_aperture_start = 0x2000000000000000ULL;
>  	adev->gmc.shared_aperture_end =
> --
> 2.17.1



More information about the amd-gfx mailing list