[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

Hawking Zhang Hawking.Zhang at amd.com
Tue Jan 2 03:44:40 UTC 2024


Move ras capablity check to amdgpu_ras_check_supported.
Driver will query ras capablity through psp interace, or
vbios interface, or specific ip callbacks.

Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +++++++++++++-----------
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5f302b7693b3..72b6e41329b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -39,6 +39,7 @@
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_psp.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include <asm/mce.h>
@@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
 		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
+/* Query ras capablity via atomfirmware interface */
+static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev)
+{
+	/* mem_ecc cap */
+	if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+		dev_info(adev->dev, "MEM ECC is active.\n");
+		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
+					 1 << AMDGPU_RAS_BLOCK__DF);
+	} else {
+		dev_info(adev->dev, "MEM ECC is not presented.\n");
+	}
+
+	/* sram_ecc cap */
+	if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+		dev_info(adev->dev, "SRAM ECC is active.\n");
+		if (!amdgpu_sriov_vf(adev))
+			adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+						  1 << AMDGPU_RAS_BLOCK__DF);
+		else
+			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
+						 1 << AMDGPU_RAS_BLOCK__SDMA |
+						 1 << AMDGPU_RAS_BLOCK__GFX);
+
+		/*
+		 * VCN/JPEG RAS can be supported on both bare metal and
+		 * SRIOV environment
+		 */
+		if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) ||
+		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) ||
+		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+						 1 << AMDGPU_RAS_BLOCK__JPEG);
+		else
+			adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+						  1 << AMDGPU_RAS_BLOCK__JPEG);
+
+		/*
+		 * XGMI RAS is not supported if xgmi num physical nodes
+		 * is zero
+		 */
+		if (!adev->gmc.xgmi.num_physical_nodes)
+			adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL);
+	} else {
+		dev_info(adev->dev, "SRAM ECC is not presented.\n");
+	}
+}
+
+/* Query poison mode from umc/df IP callbacks */
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	bool df_poison, umc_poison;
+
+	/* poison setting is useless on SRIOV guest */
+	if (amdgpu_sriov_vf(adev) || !con)
+		return;
+
+	/* Init poison supported flag, the default value is false */
+	if (adev->gmc.xgmi.connected_to_cpu ||
+	    adev->gmc.is_app_apu) {
+		/* enabled by default when GPU is connected to CPU */
+		con->poison_supported = true;
+	} else if (adev->df.funcs &&
+	    adev->df.funcs->query_ras_poison_mode &&
+	    adev->umc.ras &&
+	    adev->umc.ras->query_ras_poison_mode) {
+		df_poison =
+			adev->df.funcs->query_ras_poison_mode(adev);
+		umc_poison =
+			adev->umc.ras->query_ras_poison_mode(adev);
+
+		/* Only poison is set in both DF and UMC, we can support it */
+		if (df_poison && umc_poison)
+			con->poison_supported = true;
+		else if (df_poison != umc_poison)
+			dev_warn(adev->dev,
+				"Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+				df_poison, umc_poison);
+	}
+}
+
 /*
  * check hardware's ras ability which will be saved in hw_supported.
  * if hardware does not support ras, we can skip some ras initializtion and
@@ -2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 	if (!amdgpu_ras_asic_supported(adev))
 		return;
 
-	if (!adev->gmc.xgmi.connected_to_cpu &&	!adev->gmc.is_app_apu) {
-		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
-			dev_info(adev->dev, "MEM ECC is active.\n");
-			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
-						   1 << AMDGPU_RAS_BLOCK__DF);
-		} else {
-			dev_info(adev->dev, "MEM ECC is not presented.\n");
-		}
-
-		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
-			dev_info(adev->dev, "SRAM ECC is active.\n");
-			if (!amdgpu_sriov_vf(adev))
-				adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
-							    1 << AMDGPU_RAS_BLOCK__DF);
-			else
-				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
-								1 << AMDGPU_RAS_BLOCK__SDMA |
-								1 << AMDGPU_RAS_BLOCK__GFX);
-
-			/* VCN/JPEG RAS can be supported on both bare metal and
-			 * SRIOV environment
-			 */
-			if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
-				    IP_VERSION(2, 6, 0) ||
-			    amdgpu_ip_version(adev, VCN_HWIP, 0) ==
-				    IP_VERSION(4, 0, 0) ||
-			    amdgpu_ip_version(adev, VCN_HWIP, 0) ==
-				    IP_VERSION(4, 0, 3))
-				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
-							1 << AMDGPU_RAS_BLOCK__JPEG);
-			else
-				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
-							1 << AMDGPU_RAS_BLOCK__JPEG);
+	/* query ras capability from psp */
+	if (amdgpu_psp_get_ras_capability(&adev->psp))
+		goto init_ras_enabled_flag;
 
-			/*
-			 * XGMI RAS is not supported if xgmi num physical nodes
-			 * is zero
-			 */
-			if (!adev->gmc.xgmi.num_physical_nodes)
-				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL);
-		} else {
-			dev_info(adev->dev, "SRAM ECC is not presented.\n");
-		}
+	/* query ras capablity from bios */
+	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
+		amdgpu_ras_query_ras_capablity_from_vbios(adev);
 	} else {
 		/* driver only manages a few IP blocks RAS feature
 		 * when GPU is connected cpu through XGMI */
@@ -2747,8 +2793,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 					   1 << AMDGPU_RAS_BLOCK__MMHUB);
 	}
 
+	/* apply asic specific settings (vega20 only for now) */
 	amdgpu_ras_get_quirks(adev);
 
+	/* query poison mode from umc/df ip callback */
+	amdgpu_ras_query_poison_mode(adev);
+
+init_ras_enabled_flag:
 	/* hw_supported needs to be aligned with RAS block mask. */
 	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
 
@@ -2781,39 +2832,6 @@ static void amdgpu_ras_counte_dw(struct work_struct *work)
 	pm_runtime_put_autosuspend(dev->dev);
 }
 
-static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
-{
-	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	bool df_poison, umc_poison;
-
-	/* poison setting is useless on SRIOV guest */
-	if (amdgpu_sriov_vf(adev) || !con)
-		return;
-
-	/* Init poison supported flag, the default value is false */
-	if (adev->gmc.xgmi.connected_to_cpu ||
-	    adev->gmc.is_app_apu) {
-		/* enabled by default when GPU is connected to CPU */
-		con->poison_supported = true;
-	} else if (adev->df.funcs &&
-	    adev->df.funcs->query_ras_poison_mode &&
-	    adev->umc.ras &&
-	    adev->umc.ras->query_ras_poison_mode) {
-		df_poison =
-			adev->df.funcs->query_ras_poison_mode(adev);
-		umc_poison =
-			adev->umc.ras->query_ras_poison_mode(adev);
-
-		/* Only poison is set in both DF and UMC, we can support it */
-		if (df_poison && umc_poison)
-			con->poison_supported = true;
-		else if (df_poison != umc_poison)
-			dev_warn(adev->dev,
-				"Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
-				df_poison, umc_poison);
-	}
-}
-
 static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
 {
 	return  amdgpu_ras_is_poison_mode_supported(adev) ? AMDGPU_RAS_ERROR__POISON : 0 |
@@ -2918,8 +2936,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			goto release_con;
 	}
 
-	amdgpu_ras_query_poison_mode(adev);
-
 	/* Get RAS schema for particular SOC */
 	con->schema = amdgpu_get_ras_schema(adev);
 
-- 
2.17.1



More information about the amd-gfx mailing list