[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported
Zhou1, Tao
Tao.Zhou1 at amd.com
Wed Jan 3 02:25:51 UTC 2024
[AMD Official Use Only - General]
The series is:
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang at amd.com>
> Sent: Tuesday, January 2, 2024 10:16 PM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>; Yang,
> Stanley <Stanley.Yang at amd.com>; Wang, Yang(Kevin)
> <KevinYang.Wang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Li,
> Candice <Candice.Li at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Deucher, Alexander
> <Alexander.Deucher at amd.com>; Lazar, Lijo <Lijo.Lazar at amd.com>; Ma, Le
> <Le.Ma at amd.com>
> Subject: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to
> amdgpu_ras_check_supported
>
> Move ras capability check to amdgpu_ras_check_supported.
> Driver will query ras capability through psp interface, or vbios interface, or specific
> ip callbacks.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +++++++++++++-----------
> 1 file changed, 93 insertions(+), 77 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index a901b00d4949..2ee82baaf7d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -39,6 +39,7 @@
> #include "nbio_v7_9.h"
> #include "atom.h"
> #include "amdgpu_reset.h"
> +#include "amdgpu_psp.h"
>
> #ifdef CONFIG_X86_MCE_AMD
> #include <asm/mce.h>
> @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct
> amdgpu_device *adev)
> adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); }
>
> +/* Query ras capablity via atomfirmware interface */ static void
> +amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) {
> + /* mem_ecc cap */
> + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
> + dev_info(adev->dev, "MEM ECC is active.\n");
> + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
> + 1 << AMDGPU_RAS_BLOCK__DF);
> + } else {
> + dev_info(adev->dev, "MEM ECC is not presented.\n");
> + }
> +
> + /* sram_ecc cap */
> + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
> + dev_info(adev->dev, "SRAM ECC is active.\n");
> + if (!amdgpu_sriov_vf(adev))
> + adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> + 1 <<
> AMDGPU_RAS_BLOCK__DF);
> + else
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__PCIE_BIF |
> + 1 <<
> AMDGPU_RAS_BLOCK__SDMA |
> + 1 <<
> AMDGPU_RAS_BLOCK__GFX);
> +
> + /*
> + * VCN/JPEG RAS can be supported on both bare metal and
> + * SRIOV environment
> + */
> + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6,
> 0) ||
> + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 0) ||
> + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 3))
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__VCN |
> + 1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> + else
> + adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__VCN |
> + 1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +
> + /*
> + * XGMI RAS is not supported if xgmi num physical nodes
> + * is zero
> + */
> + if (!adev->gmc.xgmi.num_physical_nodes)
> + adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__XGMI_WAFL);
> + } else {
> + dev_info(adev->dev, "SRAM ECC is not presented.\n");
> + }
> +}
> +
> +/* Query poison mode from umc/df IP callbacks */ static void
> +amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + bool df_poison, umc_poison;
> +
> + /* poison setting is useless on SRIOV guest */
> + if (amdgpu_sriov_vf(adev) || !con)
> + return;
> +
> + /* Init poison supported flag, the default value is false */
> + if (adev->gmc.xgmi.connected_to_cpu ||
> + adev->gmc.is_app_apu) {
> + /* enabled by default when GPU is connected to CPU */
> + con->poison_supported = true;
> + } else if (adev->df.funcs &&
> + adev->df.funcs->query_ras_poison_mode &&
> + adev->umc.ras &&
> + adev->umc.ras->query_ras_poison_mode) {
> + df_poison =
> + adev->df.funcs->query_ras_poison_mode(adev);
> + umc_poison =
> + adev->umc.ras->query_ras_poison_mode(adev);
> +
> + /* Only poison is set in both DF and UMC, we can support it */
> + if (df_poison && umc_poison)
> + con->poison_supported = true;
> + else if (df_poison != umc_poison)
> + dev_warn(adev->dev,
> + "Poison setting is inconsistent in
> DF/UMC(%d:%d)!\n",
> + df_poison, umc_poison);
> + }
> +}
> +
> /*
> * check hardware's ras ability which will be saved in hw_supported.
> * if hardware does not support ras, we can skip some ras initializtion and @@ -
> 2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct
> amdgpu_device *adev)
> if (!amdgpu_ras_asic_supported(adev))
> return;
>
> - if (!adev->gmc.xgmi.connected_to_cpu && !adev-
> >gmc.is_app_apu) {
> - if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
> - dev_info(adev->dev, "MEM ECC is active.\n");
> - adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__UMC |
> - 1 <<
> AMDGPU_RAS_BLOCK__DF);
> - } else {
> - dev_info(adev->dev, "MEM ECC is not presented.\n");
> - }
> -
> - if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
> - dev_info(adev->dev, "SRAM ECC is active.\n");
> - if (!amdgpu_sriov_vf(adev))
> - adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> - 1 <<
> AMDGPU_RAS_BLOCK__DF);
> - else
> - adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__PCIE_BIF |
> - 1 <<
> AMDGPU_RAS_BLOCK__SDMA |
> - 1 <<
> AMDGPU_RAS_BLOCK__GFX);
> -
> - /* VCN/JPEG RAS can be supported on both bare metal
> and
> - * SRIOV environment
> - */
> - if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
> - IP_VERSION(2, 6, 0) ||
> - amdgpu_ip_version(adev, VCN_HWIP, 0) ==
> - IP_VERSION(4, 0, 0) ||
> - amdgpu_ip_version(adev, VCN_HWIP, 0) ==
> - IP_VERSION(4, 0, 3))
> - adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__VCN |
> - 1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> - else
> - adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__VCN |
> - 1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> + /* query ras capability from psp */
> + if (amdgpu_psp_get_ras_capability(&adev->psp))
> + goto init_ras_enabled_flag;
>
> - /*
> - * XGMI RAS is not supported if xgmi num physical nodes
> - * is zero
> - */
> - if (!adev->gmc.xgmi.num_physical_nodes)
> - adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__XGMI_WAFL);
> - } else {
> - dev_info(adev->dev, "SRAM ECC is not presented.\n");
> - }
> + /* query ras capability from bios */
> + if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
> + amdgpu_ras_query_ras_capablity_from_vbios(adev);
> } else {
> /* driver only manages a few IP blocks RAS feature
> * when GPU is connected cpu through XGMI */ @@ -2747,8
> +2793,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device
> *adev)
> 1 <<
> AMDGPU_RAS_BLOCK__MMHUB);
> }
>
> + /* apply asic specific settings (vega20 only for now) */
> amdgpu_ras_get_quirks(adev);
>
> + /* query poison mode from umc/df ip callback */
> + amdgpu_ras_query_poison_mode(adev);
> +
> +init_ras_enabled_flag:
> /* hw_supported needs to be aligned with RAS block mask. */
> adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
>
> @@ -2781,39 +2832,6 @@ static void amdgpu_ras_counte_dw(struct
> work_struct *work)
> pm_runtime_put_autosuspend(dev->dev);
> }
>
> -static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) -{
> - struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> - bool df_poison, umc_poison;
> -
> - /* poison setting is useless on SRIOV guest */
> - if (amdgpu_sriov_vf(adev) || !con)
> - return;
> -
> - /* Init poison supported flag, the default value is false */
> - if (adev->gmc.xgmi.connected_to_cpu ||
> - adev->gmc.is_app_apu) {
> - /* enabled by default when GPU is connected to CPU */
> - con->poison_supported = true;
> - } else if (adev->df.funcs &&
> - adev->df.funcs->query_ras_poison_mode &&
> - adev->umc.ras &&
> - adev->umc.ras->query_ras_poison_mode) {
> - df_poison =
> - adev->df.funcs->query_ras_poison_mode(adev);
> - umc_poison =
> - adev->umc.ras->query_ras_poison_mode(adev);
> -
> - /* Only poison is set in both DF and UMC, we can support it */
> - if (df_poison && umc_poison)
> - con->poison_supported = true;
> - else if (df_poison != umc_poison)
> - dev_warn(adev->dev,
> - "Poison setting is inconsistent in
> DF/UMC(%d:%d)!\n",
> - df_poison, umc_poison);
> - }
> -}
> -
> static int amdgpu_get_ras_schema(struct amdgpu_device *adev) {
> return amdgpu_ras_is_poison_mode_supported(adev) ?
> AMDGPU_RAS_ERROR__POISON : 0 | @@ -2918,8 +2936,6 @@ int
> amdgpu_ras_init(struct amdgpu_device *adev)
> goto release_con;
> }
>
> - amdgpu_ras_query_poison_mode(adev);
> -
> /* Get RAS schema for particular SOC */
> con->schema = amdgpu_get_ras_schema(adev);
>
> --
> 2.17.1
More information about the amd-gfx
mailing list