[PATCH 2/2] drm/amdgpu: Enable gfx v11_0_3 ras if poison mode is supported

Hawking Zhang Hawking.Zhang at amd.com
Sun Jun 11 10:45:46 UTC 2023


GFX v11_0_3 ras needs to be enabled if poison mode
is supported. Driver doesn't need issue an feature
enable call in gfx_v11_0 late init phase. The ras
late init call is already centralized to
amdgpu_ras_late_init.
In addition, move poison_mode check out of common
helper like amdgpu_ras_is_supported and
amdgpu_ras_is_feature_allowed ensure only GFX RAS
is enabled when poison mode is supported.

Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 49 ++++++++-----------------
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  | 26 -------------
 2 files changed, 16 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index dd7cdc234d7e..35e70860d628 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -126,6 +126,7 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev);
 #ifdef CONFIG_X86_MCE_AMD
 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
 struct mce_notifier_adev_list {
@@ -757,16 +758,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 	return 0;
 }
 
-static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev,
-		struct ras_common_if *head)
-{
-	if (amdgpu_ras_is_feature_allowed(adev, head) ||
-		amdgpu_ras_is_poison_mode_supported(adev))
-		return 1;
-	else
-		return 0;
-}
-
 /* wrapper of psp_ras_enable_features */
 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 		struct ras_common_if *head, bool enable)
@@ -797,7 +788,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 	}
 
 	/* Do not enable if it is not allowed. */
-	if (enable && !amdgpu_ras_check_feature_allowed(adev, head))
+	if (enable && !amdgpu_ras_is_feature_allowed(adev, head))
 		goto out;
 
 	/* Only enable ras feature operation handle on host side */
@@ -2420,9 +2411,9 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
 }
 
 /*
- * this is workaround for vega20 workstation sku,
- * force enable gfx ras, ignore vbios gfx ras flag
- * due to GC EDC can not write
+ * Common helpers for device or IP specific RAS quirks including
+ * a). Enable gfx ras on D16406 or D36002 board
+ * b). Enable gfx ras in gfx_v11_0_3 if poison mode is supported
  */
 static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
 {
@@ -2431,10 +2422,16 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
 	if (!ctx)
 		return;
 
+	/* Enable gfx ras on specific board */
 	if (strnstr(ctx->vbios_version, "D16406",
 		    sizeof(ctx->vbios_version)) ||
-		strnstr(ctx->vbios_version, "D36002",
-			sizeof(ctx->vbios_version)))
+	    strnstr(ctx->vbios_version, "D36002",
+		    sizeof(ctx->vbios_version)))
+		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
+
+	/* Enable gfx ras on gfx_v11_0_3 if poison mode is supported */
+	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3) &&
+	    amdgpu_ras_is_poison_mode_supported(adev))
 		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
@@ -2502,6 +2499,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 					   1 << AMDGPU_RAS_BLOCK__MMHUB);
 	}
 
+	amdgpu_ras_query_poison_mode(adev);
+
 	amdgpu_ras_get_quirks(adev);
 
 	/* hw_supported needs to be aligned with RAS block mask. */
@@ -2659,8 +2658,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			goto release_con;
 	}
 
-	amdgpu_ras_query_poison_mode(adev);
-
 	if (amdgpu_ras_fs_init(adev)) {
 		r = -EINVAL;
 		goto release_con;
@@ -3115,26 +3112,12 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
 int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 		unsigned int block)
 {
-	int ret = 0;
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
 	if (block >= AMDGPU_RAS_BLOCK_COUNT)
 		return 0;
 
-	ret = ras && (adev->ras_enabled & (1 << block));
-
-	/* For the special asic with mem ecc enabled but sram ecc
-	 * not enabled, even if the ras block is not supported on
-	 * .ras_enabled, if the asic supports poison mode and the
-	 * ras block has ras configuration, it can be considered
-	 * that the ras block supports ras function.
-	 */
-	if (!ret &&
-	    amdgpu_ras_is_poison_mode_supported(adev) &&
-	    amdgpu_ras_get_ras_block(adev, block, 0))
-		ret = 1;
-
-	return ret;
+	return (ras && (adev->ras_enabled & (1 << block)));
 }
 
 int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 690e121d9dda..11e0c574b9f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -4650,26 +4650,6 @@ static int gfx_v11_0_early_init(void *handle)
 	return gfx_v11_0_init_microcode(adev);
 }
 
-static int gfx_v11_0_ras_late_init(void *handle)
-{
-	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-	struct ras_common_if *gfx_common_if;
-	int ret;
-
-	gfx_common_if = kzalloc(sizeof(struct ras_common_if), GFP_KERNEL);
-	if (!gfx_common_if)
-		return -ENOMEM;
-
-	gfx_common_if->block = AMDGPU_RAS_BLOCK__GFX;
-
-	ret = amdgpu_ras_feature_enable(adev, gfx_common_if, true);
-	if (ret)
-		dev_warn(adev->dev, "Failed to enable gfx11 ras feature\n");
-
-	kfree(gfx_common_if);
-	return 0;
-}
-
 static int gfx_v11_0_late_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -4683,12 +4663,6 @@ static int gfx_v11_0_late_init(void *handle)
 	if (r)
 		return r;
 
-	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) {
-		r = gfx_v11_0_ras_late_init(handle);
-		if (r)
-			return r;
-	}
-
 	return 0;
 }
 
-- 
2.17.1



More information about the amd-gfx mailing list