[PATCH] drm/amdgpu: Add debug option to enable mode2 for poison recovery

Hawking Zhang Hawking.Zhang at amd.com
Mon Aug 12 03:25:46 UTC 2024


Add a debug option to enable mode2 reset for poison consumption
recovery. This option is intended for testing purposes only.

Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h             |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c         |  6 ++++++
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 16 ++++++++++------
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e6b641cb362a..c34819f947ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1201,6 +1201,7 @@ struct amdgpu_device {
 	bool                            debug_disable_soft_recovery;
 	bool                            debug_use_vram_fw_buf;
 	bool                            debug_enable_ras_aca;
+	bool				debug_mode2_for_poison_recovery;
 };
 
 static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index afe3b8bd35a1..be6b920933d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -133,6 +133,7 @@ enum AMDGPU_DEBUG_MASK {
 	AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
 	AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
 	AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
+	AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY = BIT(5),
 };
 
 unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -2229,6 +2230,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
 		pr_info("debug: enable RAS ACA\n");
 		adev->debug_enable_ras_aca = true;
 	}
+
+	if (amdgpu_debug_mask & AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY) {
+		pr_info("debug: enable mode2 reset for poison consumption recovery");
+		adev->debug_mode2_for_poison_recovery = true;
+	}
 }
 
 static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 816800555f7f..a355b2bc2214 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -164,10 +164,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 	case SOC15_IH_CLIENTID_SE3SH:
 	case SOC15_IH_CLIENTID_UTCL2:
 		block = AMDGPU_RAS_BLOCK__GFX;
-		if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3))
-			reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
-		else
+		if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) {
+			reset = ((dev->adev->debug_mode2_for_poison_recovery) ?
+				 AMDGPU_RAS_GPU_RESET_MODE2_RESET : AMDGPU_RAS_GPU_RESET_MODE1_RESET);
+		} else {
 			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+		}
 		break;
 	case SOC15_IH_CLIENTID_VMC:
 	case SOC15_IH_CLIENTID_VMC1:
@@ -180,10 +182,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 	case SOC15_IH_CLIENTID_SDMA3:
 	case SOC15_IH_CLIENTID_SDMA4:
 		block = AMDGPU_RAS_BLOCK__SDMA;
-		if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3))
-			reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
-		else
+		if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) {
+			reset = ((dev->adev->debug_mode2_for_poison_recovery) ?
+				 AMDGPU_RAS_GPU_RESET_MODE2_RESET : AMDGPU_RAS_GPU_RESET_MODE1_RESET);
+		} else {
 			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+		}
 		break;
 	default:
 		dev_warn(dev->adev->dev,
-- 
2.17.1



More information about the amd-gfx mailing list