[PATCH 08/11] drm/amdgpu: Rework mca ras sw_init

Yang, Stanley Stanley.Yang at amd.com
Mon Mar 6 07:24:53 UTC 2023



> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Monday, March 6, 2023 10:32 AM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>;
> Yang, Stanley <Stanley.Yang at amd.com>; Li, Candice <Candice.Li at amd.com>;
> Chai, Thomas <YiPeng.Chai at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH 08/11] drm/amdgpu: Rework mca ras sw_init
> 
> To align with other IP blocks
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 21 ++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 72
> +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h |  9 ++--
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 15 +++---
>  drivers/gpu/drm/amd/amdgpu/mca_v3_0.c   | 44 ++-------------
>  drivers/gpu/drm/amd/amdgpu/mca_v3_0.h   |  4 +-
>  6 files changed, 111 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 087a75374610..524e2c9b3012 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -477,6 +477,27 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device
> *adev)
>  		}
>  	}
> 
> +	/* mca.x ras block */
> +	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MCA))
> {
> +		r = amdgpu_mca_mp0_ras_sw_init(adev);
> +		if (r) {
> +			dev_err(adev->dev, "Failed to initialize mca.mp0 ras
> block!\n");
> +			return r;
> +		}
> +
> +		r = amdgpu_mca_mp1_ras_sw_init(adev);
> +		if (r) {
> +			dev_err(adev->dev, "Failed to initialize mca.mp1 ras
> block!\n");
> +			return r;
> +		}
> +
> +		r = amdgpu_mca_mpio_ras_sw_init(adev);
> +		if (r) {
> +			dev_err(adev->dev, "Failed to initialize mca.mpio ras
> block!\n");
> +			return r;
> +		}
> +	}
> +
>  	/* xgmi ras block */
>  	if (amdgpu_ras_is_supported(adev,
> AMDGPU_RAS_BLOCK__XGMI_WAFL)) {
>  		adev->gmc.xgmi.ras = &xgmi_ras;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> index 51c2a82e2fa4..0b545bdcd636 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> @@ -70,3 +70,75 @@ void amdgpu_mca_query_ras_error_count(struct
> amdgpu_device *adev,
> 
>  	amdgpu_mca_reset_error_count(adev, mc_status_addr);  }
> +
> +int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev) {
> +	int err;
> +	struct amdgpu_mca_ras_block *ras;
> +
> +	if (!adev->mca.mp0.ras)
> +		return 0;
> +
> +	ras = adev->mca.mp0.ras;
> +
> +	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
> +	if (err) {
> +		dev_err(adev->dev, "Failed to register mca.mp0 ras
> block!\n");
> +		return err;
> +	}
> +
> +	strcpy(ras->ras_block.ras_comm.name, "mca.mp0");
> +	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
> +	ras->ras_block.ras_comm.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> +	adev->mca.mp0.ras_if = &ras->ras_block.ras_comm;
> +
> +	return 0;
> +}
> +
> +int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev) {
> +        int err;
> +        struct amdgpu_mca_ras_block *ras;
> +
> +        if (!adev->mca.mp1.ras)
> +                return 0;
> +
> +        ras = adev->mca.mp1.ras;
> +
> +        err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
> +        if (err) {
> +                dev_err(adev->dev, "Failed to register mca.mp1 ras block!\n");
> +                return err;
> +        }
> +
> +        strcpy(ras->ras_block.ras_comm.name, "mca.mp1");
> +        ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
> +        ras->ras_block.ras_comm.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> +        adev->mca.mp1.ras_if = &ras->ras_block.ras_comm;
> +
> +        return 0;
> +}
> +
> +int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev) {
> +        int err;
> +        struct amdgpu_mca_ras_block *ras;
> +
> +        if (!adev->mca.mpio.ras)
> +                return 0;
> +
> +        ras = adev->mca.mpio.ras;
> +
> +        err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
> +        if (err) {
> +                dev_err(adev->dev, "Failed to register mca.mpio ras block!\n");
> +                return err;
> +        }
> +
> +        strcpy(ras->ras_block.ras_comm.name, "mca.mpio");
> +        ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
> +        ras->ras_block.ras_comm.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> +        adev->mca.mpio.ras_if = &ras->ras_block.ras_comm;
> +
> +        return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> index 7ce16d16e34b..997a073e2409 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> @@ -30,12 +30,7 @@ struct amdgpu_mca_ras {
>  	struct amdgpu_mca_ras_block *ras;
>  };
> 
> -struct amdgpu_mca_funcs {
> -	void (*init)(struct amdgpu_device *adev);
> -};
> -
>  struct amdgpu_mca {
> -	const struct amdgpu_mca_funcs *funcs;
>  	struct amdgpu_mca_ras mp0;
>  	struct amdgpu_mca_ras mp1;
>  	struct amdgpu_mca_ras mpio;
> @@ -55,5 +50,7 @@ void amdgpu_mca_reset_error_count(struct
> amdgpu_device *adev,  void amdgpu_mca_query_ras_error_count(struct
> amdgpu_device *adev,
>  				      uint64_t mc_status_addr,
>  				      void *ras_error_status);
> -
> +int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev); int
> +amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev); int
> +amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9a333f9744bf..67c2a5186b8a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1363,13 +1363,18 @@ static void gmc_v9_0_set_hdp_ras_funcs(struct
> amdgpu_device *adev)
>  	adev->hdp.ras = &hdp_v4_0_ras;
>  }
> 
> -static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev)
> +static void gmc_v9_0_set_mca_ras_funcs(struct amdgpu_device *adev)
>  {
> +	struct amdgpu_mca *mca = &adev->mca;
> +
>  	/* is UMC the right IP to check for MCA?  Maybe DF? */
>  	switch (adev->ip_versions[UMC_HWIP][0]) {
>  	case IP_VERSION(6, 7, 0):
> -		if (!adev->gmc.xgmi.connected_to_cpu)
> -			adev->mca.funcs = &mca_v3_0_funcs;
> +		if (!adev->gmc.xgmi.connected_to_cpu) {

[Stanley]: Can we use if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MCA)) instead of if (!adev->gmc.xgmi.connected_to_cpu)
	    here, so that the RAS support check is uniform across IP blocks?

Regards,
Stanley
> +			mca->mp0.ras = &mca_v3_0_mp0_ras;
> +			mca->mp1.ras = &mca_v3_0_mp1_ras;
> +			mca->mpio.ras = &mca_v3_0_mpio_ras;
> +		}
>  		break;
>  	default:
>  		break;
> @@ -1398,7 +1403,7 @@ static int gmc_v9_0_early_init(void *handle)
>  	gmc_v9_0_set_mmhub_ras_funcs(adev);
>  	gmc_v9_0_set_gfxhub_funcs(adev);
>  	gmc_v9_0_set_hdp_ras_funcs(adev);
> -	gmc_v9_0_set_mca_funcs(adev);
> +	gmc_v9_0_set_mca_ras_funcs(adev);
> 
>  	adev->gmc.shared_aperture_start = 0x2000000000000000ULL;
>  	adev->gmc.shared_aperture_end =
> @@ -1611,8 +1616,6 @@ static int gmc_v9_0_sw_init(void *handle)
>  	adev->gfxhub.funcs->init(adev);
> 
>  	adev->mmhub.funcs->init(adev);
> -	if (adev->mca.funcs)
> -		adev->mca.funcs->init(adev);
> 
>  	spin_lock_init(&adev->gmc.invalidate_lock);
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> index d4bd7d1d2649..6dae4a2e2767 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> @@ -51,19 +51,13 @@ static int mca_v3_0_ras_block_match(struct
> amdgpu_ras_block_object *block_obj,
>  	return -EINVAL;
>  }
> 
> -const struct amdgpu_ras_block_hw_ops mca_v3_0_mp0_hw_ops = {
> +static const struct amdgpu_ras_block_hw_ops mca_v3_0_mp0_hw_ops = {
>  	.query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
>  	.query_ras_error_address = NULL,
>  };
> 
>  struct amdgpu_mca_ras_block mca_v3_0_mp0_ras = {
>  	.ras_block = {
> -		.ras_comm = {
> -			.block = AMDGPU_RAS_BLOCK__MCA,
> -			.sub_block_index =
> AMDGPU_RAS_MCA_BLOCK__MP0,
> -			.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
> -			.name = "mp0",
> -		},
>  		.hw_ops = &mca_v3_0_mp0_hw_ops,
>  		.ras_block_match = mca_v3_0_ras_block_match,
>  	},
> @@ -77,19 +71,13 @@ static void
> mca_v3_0_mp1_query_ras_error_count(struct amdgpu_device *adev,
>  				         ras_error_status);
>  }
> 
> -const struct amdgpu_ras_block_hw_ops mca_v3_0_mp1_hw_ops = {
> +static const struct amdgpu_ras_block_hw_ops mca_v3_0_mp1_hw_ops = {
>  	.query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
>  	.query_ras_error_address = NULL,
>  };
> 
>  struct amdgpu_mca_ras_block mca_v3_0_mp1_ras = {
>  	.ras_block = {
> -		.ras_comm = {
> -			.block = AMDGPU_RAS_BLOCK__MCA,
> -			.sub_block_index =
> AMDGPU_RAS_MCA_BLOCK__MP1,
> -			.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
> -			.name = "mp1",
> -		},
>  		.hw_ops = &mca_v3_0_mp1_hw_ops,
>  		.ras_block_match = mca_v3_0_ras_block_match,
>  	},
> @@ -103,40 +91,14 @@ static void
> mca_v3_0_mpio_query_ras_error_count(struct amdgpu_device *adev,
>  				         ras_error_status);
>  }
> 
> -const struct amdgpu_ras_block_hw_ops mca_v3_0_mpio_hw_ops = {
> +static const struct amdgpu_ras_block_hw_ops mca_v3_0_mpio_hw_ops = {
>  	.query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
>  	.query_ras_error_address = NULL,
>  };
> 
>  struct amdgpu_mca_ras_block mca_v3_0_mpio_ras = {
>  	.ras_block = {
> -		.ras_comm = {
> -			.block = AMDGPU_RAS_BLOCK__MCA,
> -			.sub_block_index =
> AMDGPU_RAS_MCA_BLOCK__MPIO,
> -			.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
> -			.name = "mpio",
> -		},
>  		.hw_ops = &mca_v3_0_mpio_hw_ops,
>  		.ras_block_match = mca_v3_0_ras_block_match,
>  	},
>  };
> -
> -
> -static void mca_v3_0_init(struct amdgpu_device *adev) -{
> -	struct amdgpu_mca *mca = &adev->mca;
> -
> -	mca->mp0.ras = &mca_v3_0_mp0_ras;
> -	mca->mp1.ras = &mca_v3_0_mp1_ras;
> -	mca->mpio.ras = &mca_v3_0_mpio_ras;
> -	amdgpu_ras_register_ras_block(adev, &mca->mp0.ras->ras_block);
> -	amdgpu_ras_register_ras_block(adev, &mca->mp1.ras->ras_block);
> -	amdgpu_ras_register_ras_block(adev, &mca->mpio.ras->ras_block);
> -	mca->mp0.ras_if = &mca->mp0.ras->ras_block.ras_comm;
> -	mca->mp1.ras_if = &mca->mp1.ras->ras_block.ras_comm;
> -	mca->mpio.ras_if = &mca->mpio.ras->ras_block.ras_comm;
> -}
> -
> -const struct amdgpu_mca_funcs mca_v3_0_funcs = {
> -	.init = mca_v3_0_init,
> -};
> \ No newline at end of file
> diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h
> b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h
> index b899b86194c2..d3eaef0d7f2d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h
> +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h
> @@ -21,6 +21,8 @@
>  #ifndef __MCA_V3_0_H__
>  #define __MCA_V3_0_H__
> 
> -extern const struct amdgpu_mca_funcs mca_v3_0_funcs;
> +extern struct amdgpu_mca_ras_block mca_v3_0_mp0_ras; extern struct
> +amdgpu_mca_ras_block mca_v3_0_mp1_ras; extern struct
> +amdgpu_mca_ras_block mca_v3_0_mpio_ras;
> 
>  #endif
> --
> 2.17.1



More information about the amd-gfx mailing list