[PATCH 08/11] drm/amdgpu: Rework mca ras sw_init
Yang, Stanley
Stanley.Yang at amd.com
Mon Mar 6 07:24:53 UTC 2023
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Monday, March 6, 2023 10:32 AM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>;
> Yang, Stanley <Stanley.Yang at amd.com>; Li, Candice <Candice.Li at amd.com>;
> Chai, Thomas <YiPeng.Chai at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH 08/11] drm/amdgpu: Rework mca ras sw_init
>
> To align with other IP blocks
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 21 ++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 72
> +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 9 ++--
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 15 +++---
> drivers/gpu/drm/amd/amdgpu/mca_v3_0.c | 44 ++-------------
> drivers/gpu/drm/amd/amdgpu/mca_v3_0.h | 4 +-
> 6 files changed, 111 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 087a75374610..524e2c9b3012 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -477,6 +477,27 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device
> *adev)
> }
> }
>
> + /* mca.x ras block */
> + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MCA))
> {
> + r = amdgpu_mca_mp0_ras_sw_init(adev);
> + if (r) {
> + dev_err(adev->dev, "Failed to initialize mca.mp0 ras
> block!\n");
> + return r;
> + }
> +
> + r = amdgpu_mca_mp1_ras_sw_init(adev);
> + if (r) {
> + dev_err(adev->dev, "Failed to initialize mca.mp1 ras
> block!\n");
> + return r;
> + }
> +
> + r = amdgpu_mca_mpio_ras_sw_init(adev);
> + if (r) {
> + dev_err(adev->dev, "Failed to initialize mca.mpio ras
> block!\n");
> + return r;
> + }
> + }
> +
> /* xgmi ras block */
> if (amdgpu_ras_is_supported(adev,
> AMDGPU_RAS_BLOCK__XGMI_WAFL)) {
> adev->gmc.xgmi.ras = &xgmi_ras;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> index 51c2a82e2fa4..0b545bdcd636 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> @@ -70,3 +70,75 @@ void amdgpu_mca_query_ras_error_count(struct
> amdgpu_device *adev,
>
> amdgpu_mca_reset_error_count(adev, mc_status_addr); }
> +
> +int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev) {
> + int err;
> + struct amdgpu_mca_ras_block *ras;
> +
> + if (!adev->mca.mp0.ras)
> + return 0;
> +
> + ras = adev->mca.mp0.ras;
> +
> + err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
> + if (err) {
> + dev_err(adev->dev, "Failed to register mca.mp0 ras
> block!\n");
> + return err;
> + }
> +
> + strcpy(ras->ras_block.ras_comm.name, "mca.mp0");
> + ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
> + ras->ras_block.ras_comm.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> + adev->mca.mp0.ras_if = &ras->ras_block.ras_comm;
> +
> + return 0;
> +}
> +
> +int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev) {
> + int err;
> + struct amdgpu_mca_ras_block *ras;
> +
> + if (!adev->mca.mp1.ras)
> + return 0;
> +
> + ras = adev->mca.mp1.ras;
> +
> + err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
> + if (err) {
> + dev_err(adev->dev, "Failed to register mca.mp1 ras block!\n");
> + return err;
> + }
> +
> + strcpy(ras->ras_block.ras_comm.name, "mca.mp1");
> + ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
> + ras->ras_block.ras_comm.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> + adev->mca.mp1.ras_if = &ras->ras_block.ras_comm;
> +
> + return 0;
> +}
> +
> +int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev) {
> + int err;
> + struct amdgpu_mca_ras_block *ras;
> +
> + if (!adev->mca.mpio.ras)
> + return 0;
> +
> + ras = adev->mca.mpio.ras;
> +
> + err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
> + if (err) {
> + dev_err(adev->dev, "Failed to register mca.mpio ras block!\n");
> + return err;
> + }
> +
> + strcpy(ras->ras_block.ras_comm.name, "mca.mpio");
> + ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
> + ras->ras_block.ras_comm.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> + adev->mca.mpio.ras_if = &ras->ras_block.ras_comm;
> +
> + return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> index 7ce16d16e34b..997a073e2409 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> @@ -30,12 +30,7 @@ struct amdgpu_mca_ras {
> struct amdgpu_mca_ras_block *ras;
> };
>
> -struct amdgpu_mca_funcs {
> - void (*init)(struct amdgpu_device *adev);
> -};
> -
> struct amdgpu_mca {
> - const struct amdgpu_mca_funcs *funcs;
> struct amdgpu_mca_ras mp0;
> struct amdgpu_mca_ras mp1;
> struct amdgpu_mca_ras mpio;
> @@ -55,5 +50,7 @@ void amdgpu_mca_reset_error_count(struct
> amdgpu_device *adev, void amdgpu_mca_query_ras_error_count(struct
> amdgpu_device *adev,
> uint64_t mc_status_addr,
> void *ras_error_status);
> -
> +int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev); int
> +amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev); int
> +amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9a333f9744bf..67c2a5186b8a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1363,13 +1363,18 @@ static void gmc_v9_0_set_hdp_ras_funcs(struct
> amdgpu_device *adev)
> adev->hdp.ras = &hdp_v4_0_ras;
> }
>
> -static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev)
> +static void gmc_v9_0_set_mca_ras_funcs(struct amdgpu_device *adev)
> {
> + struct amdgpu_mca *mca = &adev->mca;
> +
> /* is UMC the right IP to check for MCA? Maybe DF? */
> switch (adev->ip_versions[UMC_HWIP][0]) {
> case IP_VERSION(6, 7, 0):
> - if (!adev->gmc.xgmi.connected_to_cpu)
> - adev->mca.funcs = &mca_v3_0_funcs;
> + if (!adev->gmc.xgmi.connected_to_cpu) {
[Stanley]: Can we use if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MCA)) instead of if (!adev->gmc.xgmi.connected_to_cpu)
here, to keep the per-IP RAS support check uniform with the other IP blocks?
Regards,
Stanley
> + mca->mp0.ras = &mca_v3_0_mp0_ras;
> + mca->mp1.ras = &mca_v3_0_mp1_ras;
> + mca->mpio.ras = &mca_v3_0_mpio_ras;
> + }
> break;
> default:
> break;
> @@ -1398,7 +1403,7 @@ static int gmc_v9_0_early_init(void *handle)
> gmc_v9_0_set_mmhub_ras_funcs(adev);
> gmc_v9_0_set_gfxhub_funcs(adev);
> gmc_v9_0_set_hdp_ras_funcs(adev);
> - gmc_v9_0_set_mca_funcs(adev);
> + gmc_v9_0_set_mca_ras_funcs(adev);
>
> adev->gmc.shared_aperture_start = 0x2000000000000000ULL;
> adev->gmc.shared_aperture_end =
> @@ -1611,8 +1616,6 @@ static int gmc_v9_0_sw_init(void *handle)
> adev->gfxhub.funcs->init(adev);
>
> adev->mmhub.funcs->init(adev);
> - if (adev->mca.funcs)
> - adev->mca.funcs->init(adev);
>
> spin_lock_init(&adev->gmc.invalidate_lock);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> index d4bd7d1d2649..6dae4a2e2767 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> @@ -51,19 +51,13 @@ static int mca_v3_0_ras_block_match(struct
> amdgpu_ras_block_object *block_obj,
> return -EINVAL;
> }
>
> -const struct amdgpu_ras_block_hw_ops mca_v3_0_mp0_hw_ops = {
> +static const struct amdgpu_ras_block_hw_ops mca_v3_0_mp0_hw_ops = {
> .query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
> .query_ras_error_address = NULL,
> };
>
> struct amdgpu_mca_ras_block mca_v3_0_mp0_ras = {
> .ras_block = {
> - .ras_comm = {
> - .block = AMDGPU_RAS_BLOCK__MCA,
> - .sub_block_index =
> AMDGPU_RAS_MCA_BLOCK__MP0,
> - .type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
> - .name = "mp0",
> - },
> .hw_ops = &mca_v3_0_mp0_hw_ops,
> .ras_block_match = mca_v3_0_ras_block_match,
> },
> @@ -77,19 +71,13 @@ static void
> mca_v3_0_mp1_query_ras_error_count(struct amdgpu_device *adev,
> ras_error_status);
> }
>
> -const struct amdgpu_ras_block_hw_ops mca_v3_0_mp1_hw_ops = {
> +static const struct amdgpu_ras_block_hw_ops mca_v3_0_mp1_hw_ops = {
> .query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
> .query_ras_error_address = NULL,
> };
>
> struct amdgpu_mca_ras_block mca_v3_0_mp1_ras = {
> .ras_block = {
> - .ras_comm = {
> - .block = AMDGPU_RAS_BLOCK__MCA,
> - .sub_block_index =
> AMDGPU_RAS_MCA_BLOCK__MP1,
> - .type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
> - .name = "mp1",
> - },
> .hw_ops = &mca_v3_0_mp1_hw_ops,
> .ras_block_match = mca_v3_0_ras_block_match,
> },
> @@ -103,40 +91,14 @@ static void
> mca_v3_0_mpio_query_ras_error_count(struct amdgpu_device *adev,
> ras_error_status);
> }
>
> -const struct amdgpu_ras_block_hw_ops mca_v3_0_mpio_hw_ops = {
> +static const struct amdgpu_ras_block_hw_ops mca_v3_0_mpio_hw_ops = {
> .query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
> .query_ras_error_address = NULL,
> };
>
> struct amdgpu_mca_ras_block mca_v3_0_mpio_ras = {
> .ras_block = {
> - .ras_comm = {
> - .block = AMDGPU_RAS_BLOCK__MCA,
> - .sub_block_index =
> AMDGPU_RAS_MCA_BLOCK__MPIO,
> - .type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
> - .name = "mpio",
> - },
> .hw_ops = &mca_v3_0_mpio_hw_ops,
> .ras_block_match = mca_v3_0_ras_block_match,
> },
> };
> -
> -
> -static void mca_v3_0_init(struct amdgpu_device *adev) -{
> - struct amdgpu_mca *mca = &adev->mca;
> -
> - mca->mp0.ras = &mca_v3_0_mp0_ras;
> - mca->mp1.ras = &mca_v3_0_mp1_ras;
> - mca->mpio.ras = &mca_v3_0_mpio_ras;
> - amdgpu_ras_register_ras_block(adev, &mca->mp0.ras->ras_block);
> - amdgpu_ras_register_ras_block(adev, &mca->mp1.ras->ras_block);
> - amdgpu_ras_register_ras_block(adev, &mca->mpio.ras->ras_block);
> - mca->mp0.ras_if = &mca->mp0.ras->ras_block.ras_comm;
> - mca->mp1.ras_if = &mca->mp1.ras->ras_block.ras_comm;
> - mca->mpio.ras_if = &mca->mpio.ras->ras_block.ras_comm;
> -}
> -
> -const struct amdgpu_mca_funcs mca_v3_0_funcs = {
> - .init = mca_v3_0_init,
> -};
> \ No newline at end of file
> diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h
> b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h
> index b899b86194c2..d3eaef0d7f2d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h
> +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h
> @@ -21,6 +21,8 @@
> #ifndef __MCA_V3_0_H__
> #define __MCA_V3_0_H__
>
> -extern const struct amdgpu_mca_funcs mca_v3_0_funcs;
> +extern struct amdgpu_mca_ras_block mca_v3_0_mp0_ras; extern struct
> +amdgpu_mca_ras_block mca_v3_0_mp1_ras; extern struct
> +amdgpu_mca_ras_block mca_v3_0_mpio_ras;
>
> #endif
> --
> 2.17.1
More information about the amd-gfx
mailing list