[PATCH 2/2] drm/amdgpu: Enable gfx v11_0_3 ras if poison mode is supported

Yang, Stanley Stanley.Yang at amd.com
Mon Jun 12 08:29:54 UTC 2023


[AMD Official Use Only - General]

> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Sunday, June 11, 2023 6:46 PM
> To: amd-gfx at lists.freedesktop.org; Yang, Stanley <Stanley.Yang at amd.com>;
> Li, Candice <Candice.Li at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>;
> Zhou1, Tao <Tao.Zhou1 at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH 2/2] drm/amdgpu: Enable gfx v11_0_3 ras if poison mode is
> supported
>
> GFX v11_0_3 ras needs to be enabled if poison mode is supported. Driver
> doesn't need issue an feature enable call in gfx_v11_0 late init phase. The ras
> late init call is already centralized to amdgpu_ras_late_init.
> In addition, move poison_mode check out of common helper like
> amdgpu_ras_is_supported and amdgpu_ras_is_feature_allowed ensure only
> GFX RAS is enabled when poison mode is supported.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 49 ++++++++-----------------
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  | 26 -------------
>  2 files changed, 16 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index dd7cdc234d7e..35e70860d628 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -126,6 +126,7 @@ static bool
> amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
>                               uint64_t addr);
>  static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
>                               uint64_t addr);
> +static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev);
>  #ifdef CONFIG_X86_MCE_AMD
>  static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device
> *adev);  struct mce_notifier_adev_list { @@ -757,16 +758,6 @@ static int
> __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
>       return 0;
>  }
>
> -static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev,
> -             struct ras_common_if *head)
> -{
> -     if (amdgpu_ras_is_feature_allowed(adev, head) ||
> -             amdgpu_ras_is_poison_mode_supported(adev))
> -             return 1;
> -     else
> -             return 0;
> -}
> -
>  /* wrapper of psp_ras_enable_features */  int
> amdgpu_ras_feature_enable(struct amdgpu_device *adev,
>               struct ras_common_if *head, bool enable) @@ -797,7 +788,7
> @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
>       }
>
>       /* Do not enable if it is not allowed. */
> -     if (enable && !amdgpu_ras_check_feature_allowed(adev, head))
> +     if (enable && !amdgpu_ras_is_feature_allowed(adev, head))
>               goto out;
>
>       /* Only enable ras feature operation handle on host side */ @@ -
> 2420,9 +2411,9 @@ static bool amdgpu_ras_asic_supported(struct
> amdgpu_device *adev)  }
>
>  /*
> - * this is workaround for vega20 workstation sku,
> - * force enable gfx ras, ignore vbios gfx ras flag
> - * due to GC EDC can not write
> + * Common helpers for device or IP specific RAS quirks including
> + * a). Enable gfx ras on D16406 or D36002 board
> + * b). Enable gfx ras in gfx_v11_0_3 if poison mode is supported
>   */
>  static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)  { @@ -
> 2431,10 +2422,16 @@ static void amdgpu_ras_get_quirks(struct
> amdgpu_device *adev)
>       if (!ctx)
>               return;
>
> +     /* Enable gfx ras on specific board */
>       if (strnstr(ctx->vbios_version, "D16406",
>                   sizeof(ctx->vbios_version)) ||
> -             strnstr(ctx->vbios_version, "D36002",
> -                     sizeof(ctx->vbios_version)))
> +         strnstr(ctx->vbios_version, "D36002",
> +                 sizeof(ctx->vbios_version)))
> +             adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__GFX);
> +
> +     /* Enable gfx ras on gfx_v11_0_3 if poison mode is supported */
> +     if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3) &&
> +         amdgpu_ras_is_poison_mode_supported(adev))
>               adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__GFX);  }

[Stanley]: For GC 11.0.3, it's better not expose AMDGPU_RAS_BLOCK__GFX to kfd, may be with below is more reasonable.
{
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 8e4124dcb6e4..84030289ac96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1996,9 +1996,10 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
        }

        /* kfd only concerns sram ecc on GFX and HBM ecc on UMC */
-       dev->node_props.capability |=
-               ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
-               HSA_CAP_SRAM_EDCSUPPORTED : 0;
+       if (KFD_GC_VERSION(dev->gpu) != IP_VERSION(11, 0, 3))
+               dev->node_props.capability |=
+                       ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
+                       HSA_CAP_SRAM_EDCSUPPORTED : 0;
        dev->node_props.capability |=
                ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
                HSA_CAP_MEM_EDCSUPPORTED : 0;
--
}

Regards,
Stanley
>
> @@ -2502,6 +2499,8 @@ static void amdgpu_ras_check_supported(struct
> amdgpu_device *adev)
>                                          1 <<
> AMDGPU_RAS_BLOCK__MMHUB);
>       }
>
> +     amdgpu_ras_query_poison_mode(adev);
> +
>       amdgpu_ras_get_quirks(adev);
>
>       /* hw_supported needs to be aligned with RAS block mask. */ @@ -
> 2659,8 +2658,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>                       goto release_con;
>       }
>
> -     amdgpu_ras_query_poison_mode(adev);
> -
>       if (amdgpu_ras_fs_init(adev)) {
>               r = -EINVAL;
>               goto release_con;
> @@ -3115,26 +3112,12 @@ int amdgpu_ras_set_context(struct
> amdgpu_device *adev, struct amdgpu_ras *ras_co  int
> amdgpu_ras_is_supported(struct amdgpu_device *adev,
>               unsigned int block)
>  {
> -     int ret = 0;
>       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>
>       if (block >= AMDGPU_RAS_BLOCK_COUNT)
>               return 0;
>
> -     ret = ras && (adev->ras_enabled & (1 << block));
> -
> -     /* For the special asic with mem ecc enabled but sram ecc
> -      * not enabled, even if the ras block is not supported on
> -      * .ras_enabled, if the asic supports poison mode and the
> -      * ras block has ras configuration, it can be considered
> -      * that the ras block supports ras function.
> -      */
> -     if (!ret &&
> -         amdgpu_ras_is_poison_mode_supported(adev) &&
> -         amdgpu_ras_get_ras_block(adev, block, 0))
> -             ret = 1;
> -
> -     return ret;
> +     return (ras && (adev->ras_enabled & (1 << block)));
>  }
>
>  int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) diff --git
> a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 690e121d9dda..11e0c574b9f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -4650,26 +4650,6 @@ static int gfx_v11_0_early_init(void *handle)
>       return gfx_v11_0_init_microcode(adev);  }
>
> -static int gfx_v11_0_ras_late_init(void *handle) -{
> -     struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -     struct ras_common_if *gfx_common_if;
> -     int ret;
> -
> -     gfx_common_if = kzalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> -     if (!gfx_common_if)
> -             return -ENOMEM;
> -
> -     gfx_common_if->block = AMDGPU_RAS_BLOCK__GFX;
> -
> -     ret = amdgpu_ras_feature_enable(adev, gfx_common_if, true);
> -     if (ret)
> -             dev_warn(adev->dev, "Failed to enable gfx11 ras feature\n");
> -
> -     kfree(gfx_common_if);
> -     return 0;
> -}
> -
>  static int gfx_v11_0_late_init(void *handle)  {
>       struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -
> 4683,12 +4663,6 @@ static int gfx_v11_0_late_init(void *handle)
>       if (r)
>               return r;
>
> -     if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) {
> -             r = gfx_v11_0_ras_late_init(handle);
> -             if (r)
> -                     return r;
> -     }
> -
>       return 0;
>  }
>
> --
> 2.17.1



More information about the amd-gfx mailing list