[PATCH 1/2] drm/amdgpu: Add init level for post reset reinit

Fri Nov 15 08:03:31 UTC 2024

When device needs to be reset before initialization, it's not required
for all IPs to be initialized before a reset. In such cases, it needs to
identify whether the IP/feature is initialized for the first time or
whether it's reinitialized after a reset.

Add RESET_RECOVERY init level to identify post reset reinitialization
phase. This only provides a device level identification, IP/features may
choose to track their state independently also.

Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/aldebaran.c      |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 24 ++++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c   |  5 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h   |  2 ++
 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c |  2 ++
 drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c   |  2 ++
 7 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 6a2fd9e4f470..57c1ca055388 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -330,6 +330,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 	}
 
 	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
+		amdgpu_set_init_level(tmp_adev,
+				      AMDGPU_INIT_LEVEL_RESET_RECOVERY);
 		dev_info(tmp_adev->dev,
 			 "GPU reset succeeded, trying to resume\n");
 		/*TBD: Ideally should clear only GFX, SDMA blocks*/
@@ -377,6 +379,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 							tmp_adev);
 
 		if (!r) {
+			amdgpu_set_init_level(tmp_adev,
+					      AMDGPU_INIT_LEVEL_DEFAULT);
 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
 
 			r = amdgpu_ib_ring_tests(tmp_adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4f72ad4e843f..b8ef89d64704 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -846,6 +846,7 @@ struct amdgpu_mqd {
 enum amdgpu_init_lvl_id {
 	AMDGPU_INIT_LEVEL_DEFAULT,
 	AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
+	AMDGPU_INIT_LEVEL_RESET_RECOVERY,
 };
 
 struct amdgpu_init_level {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0419b37e75a8..415c469c2d80 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -155,6 +155,11 @@ struct amdgpu_init_level amdgpu_init_default = {
 	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
 };
 
+struct amdgpu_init_level amdgpu_init_recovery = {
+	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
+	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
+};
+
 /*
  * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
  * is used for cases like reset on initialization where the entire hive needs to
@@ -181,6 +186,9 @@ void amdgpu_set_init_level(struct amdgpu_device *adev,
 	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
 		adev->init_lvl = &amdgpu_init_minimal_xgmi;
 		break;
+	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
+		adev->init_lvl = &amdgpu_init_recovery;
+		break;
 	case AMDGPU_INIT_LEVEL_DEFAULT:
 		fallthrough;
 	default:
@@ -5445,7 +5453,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
 	struct list_head *device_list_handle;
 	bool full_reset, vram_lost = false;
 	struct amdgpu_device *tmp_adev;
-	int r;
+	int r, init_level;
 
 	device_list_handle = reset_context->reset_device_list;
 
@@ -5454,10 +5462,17 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
 
 	full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 
+	/**
+	 * If it's reset on init, it's default init level, otherwise keep level
+	 * as recovery level.
+	 */
+	if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
+		init_level = AMDGPU_INIT_LEVEL_DEFAULT;
+	else
+		init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
 	r = 0;
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-		/* After reset, it's default init level */
-		amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
+		amdgpu_set_init_level(tmp_adev, init_level);
 		if (full_reset) {
 			/* post card */
 			amdgpu_ras_clear_err_state(tmp_adev);
@@ -5544,6 +5559,9 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
 
 out:
 		if (!r) {
+			/* IP init is complete now, set level as default */
+			amdgpu_set_init_level(tmp_adev,
+					      AMDGPU_INIT_LEVEL_DEFAULT);
 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
 			r = amdgpu_ib_ring_tests(tmp_adev);
 			if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 4fc0ee01d56b..59a29fa12db3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -343,3 +343,8 @@ void amdgpu_reset_get_desc(struct amdgpu_reset_context *rst_ctxt, char *buf,
 		strscpy(buf, "unknown", len);
 	}
 }
+
+bool amdgpu_reset_in_recovery(struct amdgpu_device *adev)
+{
+	return (adev->init_lvl->level == AMDGPU_INIT_LEVEL_RESET_RECOVERY);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index f8628bc898df..4d9b9701139b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -158,4 +158,6 @@ extern struct amdgpu_reset_handler xgmi_reset_on_init_handler;
 int amdgpu_reset_do_xgmi_reset_on_init(
 	struct amdgpu_reset_context *reset_context);
 
+bool amdgpu_reset_in_recovery(struct amdgpu_device *adev);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
index 9b01e074af47..2594467bdd87 100644
--- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -220,6 +220,7 @@ sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 	int r;
 	struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle;
 
+	amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_RESET_RECOVERY);
 	dev_info(tmp_adev->dev,
 			"GPU reset succeeded, trying to resume\n");
 	r = sienna_cichlid_mode2_restore_ip(tmp_adev);
@@ -237,6 +238,7 @@ sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 
 	amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
 
+	amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
 	r = amdgpu_ib_ring_tests(tmp_adev);
 	if (r) {
 		dev_err(tmp_adev->dev,
diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
index e70ebad3f9fa..70569ea906bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
@@ -221,6 +221,7 @@ smu_v13_0_10_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 	int r;
 	struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle;
 
+	amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_RESET_RECOVERY);
 	dev_info(tmp_adev->dev,
 			"GPU reset succeeded, trying to resume\n");
 	r = smu_v13_0_10_mode2_restore_ip(tmp_adev);
@@ -234,6 +235,7 @@ smu_v13_0_10_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 
 	amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
 
+	amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
 	r = amdgpu_ib_ring_tests(tmp_adev);
 	if (r) {
 		dev_err(tmp_adev->dev,
-- 
2.25.1