[PATCH 3/3] drm/amdgpu: adjust gpu reset sequence for gfx v11_0_3

YiPeng Chai YiPeng.Chai at amd.com
Thu Apr 27 08:02:19 UTC 2023


When GFX RAS poison consumption triggers a GPU reset on gfx v11_0_3,
the reset sequence is "soft reset -> mode2 reset -> mode1 reset".
If one reset method fails, fall back to the next method in the sequence.

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 ++++++++++++++++------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a5086be4d7dd..c8d2a281098f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4770,13 +4770,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	if (job && job->vm)
 		drm_sched_increase_karma(&job->base);
 
-	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
-	/* If reset handler not implemented, continue; otherwise return */
-	if (r == -ENOSYS)
-		r = 0;
-	else
-		return r;
-
 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
 	if (!amdgpu_sriov_vf(adev)) {
 
@@ -4789,12 +4782,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 			r = amdgpu_device_ip_soft_reset(adev);
 			amdgpu_device_ip_post_soft_reset(adev);
 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
-				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
+				struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+				if (ras->reset_by_gfx_poison) {
+					reset_context->method = AMD_RESET_METHOD_MODE2;
+					dev_info(adev->dev, "soft reset failed, will fallback to mode2 reset!\n");
+				} else {
+					dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
+				}
 				need_full_reset = true;
 			}
 		}
 
-		if (need_full_reset)
+		/* IP suspend will affect mode2 reset, so ip suspend is skipped
+		 * when mode2 reset is enabled.
+		 */
+		if (need_full_reset &&
+		    (reset_context->method != AMD_RESET_METHOD_MODE2))
 			r = amdgpu_device_ip_suspend(adev);
 		if (need_full_reset)
 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
@@ -4803,6 +4807,11 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 				  &reset_context->flags);
 	}
 
+	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
+	/* A missing reset handler (-ENOSYS) is not an error; otherwise return its result */
+	if (r == -ENOSYS)
+		r = 0;
+
 	return r;
 }
 
@@ -4892,7 +4901,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 	/* If reset handler not implemented, continue; otherwise return */
 	if (r == -ENOSYS)
 		r = 0;
-	else
+	else if (!r) /* Mode2 reset successful, return */
 		return r;
 
 	/* Reset handler not implemented, use the default method */
@@ -4904,6 +4913,17 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
 			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 
+	/* If mode2 reset was selected, IP suspend was skipped earlier in
+	 * amdgpu_device_pre_asic_reset(). Mode1 reset, however, requires
+	 * IP suspend, so call it here before falling back.
+	 */
+	if (need_full_reset &&
+	   (reset_context->method == AMD_RESET_METHOD_MODE2)) {
+		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+			amdgpu_device_ip_suspend(tmp_adev);
+		}
+	}
+
 	/*
 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
 	 * to allow proper links negotiation in FW (within 1 sec)
-- 
2.34.1



More information about the amd-gfx mailing list