[PATCH 2/2] drm/amdgpu: Enable gfx v11_0_3 ras if poison mode is supported
Yang, Stanley
Stanley.Yang at amd.com
Mon Jun 12 08:29:54 UTC 2023
[AMD Official Use Only - General]
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Sunday, June 11, 2023 6:46 PM
> To: amd-gfx at lists.freedesktop.org; Yang, Stanley <Stanley.Yang at amd.com>;
> Li, Candice <Candice.Li at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>;
> Zhou1, Tao <Tao.Zhou1 at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH 2/2] drm/amdgpu: Enable gfx v11_0_3 ras if poison mode is
> supported
>
> GFX v11_0_3 ras needs to be enabled if poison mode is supported. Driver
> doesn't need issue an feature enable call in gfx_v11_0 late init phase. The ras
> late init call is already centralized to amdgpu_ras_late_init.
> In addition, move poison_mode check out of common helper like
> amdgpu_ras_is_supported and amdgpu_ras_is_feature_allowed ensure only
> GFX RAS is enabled when poison mode is supported.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 49 ++++++++-----------------
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 26 -------------
> 2 files changed, 16 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index dd7cdc234d7e..35e70860d628 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -126,6 +126,7 @@ static bool
> amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
> uint64_t addr);
> static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
> uint64_t addr);
> +static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev);
> #ifdef CONFIG_X86_MCE_AMD
> static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device
> *adev); struct mce_notifier_adev_list { @@ -757,16 +758,6 @@ static int
> __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
> return 0;
> }
>
> -static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev,
> - struct ras_common_if *head)
> -{
> - if (amdgpu_ras_is_feature_allowed(adev, head) ||
> - amdgpu_ras_is_poison_mode_supported(adev))
> - return 1;
> - else
> - return 0;
> -}
> -
> /* wrapper of psp_ras_enable_features */ int
> amdgpu_ras_feature_enable(struct amdgpu_device *adev,
> struct ras_common_if *head, bool enable) @@ -797,7 +788,7
> @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
> }
>
> /* Do not enable if it is not allowed. */
> - if (enable && !amdgpu_ras_check_feature_allowed(adev, head))
> + if (enable && !amdgpu_ras_is_feature_allowed(adev, head))
> goto out;
>
> /* Only enable ras feature operation handle on host side */ @@ -
> 2420,9 +2411,9 @@ static bool amdgpu_ras_asic_supported(struct
> amdgpu_device *adev) }
>
> /*
> - * this is workaround for vega20 workstation sku,
> - * force enable gfx ras, ignore vbios gfx ras flag
> - * due to GC EDC can not write
> + * Common helpers for device or IP specific RAS quirks including
> + * a). Enable gfx ras on D16406 or D36002 board
> + * b). Enable gfx ras in gfx_v11_0_3 if poison mode is supported
> */
> static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) { @@ -
> 2431,10 +2422,16 @@ static void amdgpu_ras_get_quirks(struct
> amdgpu_device *adev)
> if (!ctx)
> return;
>
> + /* Enable gfx ras on specific board */
> if (strnstr(ctx->vbios_version, "D16406",
> sizeof(ctx->vbios_version)) ||
> - strnstr(ctx->vbios_version, "D36002",
> - sizeof(ctx->vbios_version)))
> + strnstr(ctx->vbios_version, "D36002",
> + sizeof(ctx->vbios_version)))
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__GFX);
> +
> + /* Enable gfx ras on gfx_v11_0_3 if poison mode is supported */
> + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3) &&
> + amdgpu_ras_is_poison_mode_supported(adev))
> adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__GFX); }
[Stanley]: For GC 11.0.3, it's better not expose AMDGPU_RAS_BLOCK__GFX to kfd, may be with below is more reasonable.
{
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 8e4124dcb6e4..84030289ac96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1996,9 +1996,10 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
}
/* kfd only concerns sram ecc on GFX and HBM ecc on UMC */
- dev->node_props.capability |=
- ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
- HSA_CAP_SRAM_EDCSUPPORTED : 0;
+ if (KFD_GC_VERSION(dev->gpu) != IP_VERSION(11, 0, 3))
+ dev->node_props.capability |=
+ ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
+ HSA_CAP_SRAM_EDCSUPPORTED : 0;
dev->node_props.capability |=
((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
HSA_CAP_MEM_EDCSUPPORTED : 0;
--
}
Regards,
Stanley
>
> @@ -2502,6 +2499,8 @@ static void amdgpu_ras_check_supported(struct
> amdgpu_device *adev)
> 1 <<
> AMDGPU_RAS_BLOCK__MMHUB);
> }
>
> + amdgpu_ras_query_poison_mode(adev);
> +
> amdgpu_ras_get_quirks(adev);
>
> /* hw_supported needs to be aligned with RAS block mask. */ @@ -
> 2659,8 +2658,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
> goto release_con;
> }
>
> - amdgpu_ras_query_poison_mode(adev);
> -
> if (amdgpu_ras_fs_init(adev)) {
> r = -EINVAL;
> goto release_con;
> @@ -3115,26 +3112,12 @@ int amdgpu_ras_set_context(struct
> amdgpu_device *adev, struct amdgpu_ras *ras_co int
> amdgpu_ras_is_supported(struct amdgpu_device *adev,
> unsigned int block)
> {
> - int ret = 0;
> struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>
> if (block >= AMDGPU_RAS_BLOCK_COUNT)
> return 0;
>
> - ret = ras && (adev->ras_enabled & (1 << block));
> -
> - /* For the special asic with mem ecc enabled but sram ecc
> - * not enabled, even if the ras block is not supported on
> - * .ras_enabled, if the asic supports poison mode and the
> - * ras block has ras configuration, it can be considered
> - * that the ras block supports ras function.
> - */
> - if (!ret &&
> - amdgpu_ras_is_poison_mode_supported(adev) &&
> - amdgpu_ras_get_ras_block(adev, block, 0))
> - ret = 1;
> -
> - return ret;
> + return (ras && (adev->ras_enabled & (1 << block)));
> }
>
> int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) diff --git
> a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 690e121d9dda..11e0c574b9f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -4650,26 +4650,6 @@ static int gfx_v11_0_early_init(void *handle)
> return gfx_v11_0_init_microcode(adev); }
>
> -static int gfx_v11_0_ras_late_init(void *handle) -{
> - struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> - struct ras_common_if *gfx_common_if;
> - int ret;
> -
> - gfx_common_if = kzalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> - if (!gfx_common_if)
> - return -ENOMEM;
> -
> - gfx_common_if->block = AMDGPU_RAS_BLOCK__GFX;
> -
> - ret = amdgpu_ras_feature_enable(adev, gfx_common_if, true);
> - if (ret)
> - dev_warn(adev->dev, "Failed to enable gfx11 ras feature\n");
> -
> - kfree(gfx_common_if);
> - return 0;
> -}
> -
> static int gfx_v11_0_late_init(void *handle) {
> struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -
> 4683,12 +4663,6 @@ static int gfx_v11_0_late_init(void *handle)
> if (r)
> return r;
>
> - if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) {
> - r = gfx_v11_0_ras_late_init(handle);
> - if (r)
> - return r;
> - }
> -
> return 0;
> }
>
> --
> 2.17.1
More information about the amd-gfx
mailing list