[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

Zhou1, Tao Tao.Zhou1 at amd.com
Wed Jan 3 02:25:51 UTC 2024


[AMD Official Use Only - General]

The series is:

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang at amd.com>
> Sent: Tuesday, January 2, 2024 10:16 PM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>; Yang,
> Stanley <Stanley.Yang at amd.com>; Wang, Yang(Kevin)
> <KevinYang.Wang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Li,
> Candice <Candice.Li at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Deucher, Alexander
> <Alexander.Deucher at amd.com>; Lazar, Lijo <Lijo.Lazar at amd.com>; Ma, Le
> <Le.Ma at amd.com>
> Subject: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to
> amdgpu_ras_check_supported
>
> Move ras capability check to amdgpu_ras_check_supported.
> Driver will query ras capability through psp interface, or vbios interface, or specific
> ip callbacks.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +++++++++++++-----------
>  1 file changed, 93 insertions(+), 77 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index a901b00d4949..2ee82baaf7d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -39,6 +39,7 @@
>  #include "nbio_v7_9.h"
>  #include "atom.h"
>  #include "amdgpu_reset.h"
> +#include "amdgpu_psp.h"
>
>  #ifdef CONFIG_X86_MCE_AMD
>  #include <asm/mce.h>
> @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct
> amdgpu_device *adev)
>               adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);  }
>
> +/* Query ras capablity via atomfirmware interface */ static void
> +amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) {
> +     /* mem_ecc cap */
> +     if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
> +             dev_info(adev->dev, "MEM ECC is active.\n");
> +             adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
> +                                      1 << AMDGPU_RAS_BLOCK__DF);
> +     } else {
> +             dev_info(adev->dev, "MEM ECC is not presented.\n");
> +     }
> +
> +     /* sram_ecc cap */
> +     if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
> +             dev_info(adev->dev, "SRAM ECC is active.\n");
> +             if (!amdgpu_sriov_vf(adev))
> +                     adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> +                                               1 <<
> AMDGPU_RAS_BLOCK__DF);
> +             else
> +                     adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__PCIE_BIF |
> +                                              1 <<
> AMDGPU_RAS_BLOCK__SDMA |
> +                                              1 <<
> AMDGPU_RAS_BLOCK__GFX);
> +
> +             /*
> +              * VCN/JPEG RAS can be supported on both bare metal and
> +              * SRIOV environment
> +              */
> +             if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6,
> 0) ||
> +                 amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 0) ||
> +                 amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 3))
> +                     adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +                                              1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +             else
> +                     adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +                                               1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +
> +             /*
> +              * XGMI RAS is not supported if xgmi num physical nodes
> +              * is zero
> +              */
> +             if (!adev->gmc.xgmi.num_physical_nodes)
> +                     adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__XGMI_WAFL);
> +     } else {
> +             dev_info(adev->dev, "SRAM ECC is not presented.\n");
> +     }
> +}
> +
> +/* Query poison mode from umc/df IP callbacks */ static void
> +amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) {
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     bool df_poison, umc_poison;
> +
> +     /* poison setting is useless on SRIOV guest */
> +     if (amdgpu_sriov_vf(adev) || !con)
> +             return;
> +
> +     /* Init poison supported flag, the default value is false */
> +     if (adev->gmc.xgmi.connected_to_cpu ||
> +         adev->gmc.is_app_apu) {
> +             /* enabled by default when GPU is connected to CPU */
> +             con->poison_supported = true;
> +     } else if (adev->df.funcs &&
> +         adev->df.funcs->query_ras_poison_mode &&
> +         adev->umc.ras &&
> +         adev->umc.ras->query_ras_poison_mode) {
> +             df_poison =
> +                     adev->df.funcs->query_ras_poison_mode(adev);
> +             umc_poison =
> +                     adev->umc.ras->query_ras_poison_mode(adev);
> +
> +             /* Only poison is set in both DF and UMC, we can support it */
> +             if (df_poison && umc_poison)
> +                     con->poison_supported = true;
> +             else if (df_poison != umc_poison)
> +                     dev_warn(adev->dev,
> +                             "Poison setting is inconsistent in
> DF/UMC(%d:%d)!\n",
> +                             df_poison, umc_poison);
> +     }
> +}
> +
>  /*
>   * check hardware's ras ability which will be saved in hw_supported.
>   * if hardware does not support ras, we can skip some ras initializtion and @@ -
> 2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct
> amdgpu_device *adev)
>       if (!amdgpu_ras_asic_supported(adev))
>               return;
>
> -     if (!adev->gmc.xgmi.connected_to_cpu && !adev-
> >gmc.is_app_apu) {
> -             if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
> -                     dev_info(adev->dev, "MEM ECC is active.\n");
> -                     adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__UMC |
> -                                                1 <<
> AMDGPU_RAS_BLOCK__DF);
> -             } else {
> -                     dev_info(adev->dev, "MEM ECC is not presented.\n");
> -             }
> -
> -             if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
> -                     dev_info(adev->dev, "SRAM ECC is active.\n");
> -                     if (!amdgpu_sriov_vf(adev))
> -                             adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> -                                                         1 <<
> AMDGPU_RAS_BLOCK__DF);
> -                     else
> -                             adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__PCIE_BIF |
> -                                                             1 <<
> AMDGPU_RAS_BLOCK__SDMA |
> -                                                             1 <<
> AMDGPU_RAS_BLOCK__GFX);
> -
> -                     /* VCN/JPEG RAS can be supported on both bare metal
> and
> -                      * SRIOV environment
> -                      */
> -                     if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
> -                                 IP_VERSION(2, 6, 0) ||
> -                         amdgpu_ip_version(adev, VCN_HWIP, 0) ==
> -                                 IP_VERSION(4, 0, 0) ||
> -                         amdgpu_ip_version(adev, VCN_HWIP, 0) ==
> -                                 IP_VERSION(4, 0, 3))
> -                             adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__VCN |
> -                                                     1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> -                     else
> -                             adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__VCN |
> -                                                     1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +     /* query ras capability from psp */
> +     if (amdgpu_psp_get_ras_capability(&adev->psp))
> +             goto init_ras_enabled_flag;
>
> -                     /*
> -                      * XGMI RAS is not supported if xgmi num physical nodes
> -                      * is zero
> -                      */
> -                     if (!adev->gmc.xgmi.num_physical_nodes)
> -                             adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__XGMI_WAFL);
> -             } else {
> -                     dev_info(adev->dev, "SRAM ECC is not presented.\n");
> -             }
> +     /* query ras capablity from bios */
> +     if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
> +             amdgpu_ras_query_ras_capablity_from_vbios(adev);
>       } else {
>               /* driver only manages a few IP blocks RAS feature
>                * when GPU is connected cpu through XGMI */ @@ -2747,8
> +2793,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device
> *adev)
>                                          1 <<
> AMDGPU_RAS_BLOCK__MMHUB);
>       }
>
> +     /* apply asic specific settings (vega20 only for now) */
>       amdgpu_ras_get_quirks(adev);
>
> +     /* query poison mode from umc/df ip callback */
> +     amdgpu_ras_query_poison_mode(adev);
> +
> +init_ras_enabled_flag:
>       /* hw_supported needs to be aligned with RAS block mask. */
>       adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
>
> @@ -2781,39 +2832,6 @@ static void amdgpu_ras_counte_dw(struct
> work_struct *work)
>       pm_runtime_put_autosuspend(dev->dev);
>  }
>
> -static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) -{
> -     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> -     bool df_poison, umc_poison;
> -
> -     /* poison setting is useless on SRIOV guest */
> -     if (amdgpu_sriov_vf(adev) || !con)
> -             return;
> -
> -     /* Init poison supported flag, the default value is false */
> -     if (adev->gmc.xgmi.connected_to_cpu ||
> -         adev->gmc.is_app_apu) {
> -             /* enabled by default when GPU is connected to CPU */
> -             con->poison_supported = true;
> -     } else if (adev->df.funcs &&
> -         adev->df.funcs->query_ras_poison_mode &&
> -         adev->umc.ras &&
> -         adev->umc.ras->query_ras_poison_mode) {
> -             df_poison =
> -                     adev->df.funcs->query_ras_poison_mode(adev);
> -             umc_poison =
> -                     adev->umc.ras->query_ras_poison_mode(adev);
> -
> -             /* Only poison is set in both DF and UMC, we can support it */
> -             if (df_poison && umc_poison)
> -                     con->poison_supported = true;
> -             else if (df_poison != umc_poison)
> -                     dev_warn(adev->dev,
> -                             "Poison setting is inconsistent in
> DF/UMC(%d:%d)!\n",
> -                             df_poison, umc_poison);
> -     }
> -}
> -
>  static int amdgpu_get_ras_schema(struct amdgpu_device *adev)  {
>       return  amdgpu_ras_is_poison_mode_supported(adev) ?
> AMDGPU_RAS_ERROR__POISON : 0 | @@ -2918,8 +2936,6 @@ int
> amdgpu_ras_init(struct amdgpu_device *adev)
>                       goto release_con;
>       }
>
> -     amdgpu_ras_query_poison_mode(adev);
> -
>       /* Get RAS schema for particular SOC */
>       con->schema = amdgpu_get_ras_schema(adev);
>
> --
> 2.17.1



More information about the amd-gfx mailing list