[PATCH 3/3] drm/amdgpu: adjust gpu reset sequence for gfx v11_0_3
YiPeng Chai
YiPeng.Chai at amd.com
Thu Apr 27 08:02:19 UTC 2023
When GFX RAS poison consumption triggers a GPU reset on gfx v11_0_3,
the GPU reset sequence is "soft reset -> mode2 reset -> mode1 reset".
If a reset method in the sequence fails, fall back to the next one.
Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 ++++++++++++++++------
1 file changed, 30 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a5086be4d7dd..c8d2a281098f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4770,13 +4770,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (job && job->vm)
drm_sched_increase_karma(&job->base);
- r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
- /* If reset handler not implemented, continue; otherwise return */
- if (r == -ENOSYS)
- r = 0;
- else
- return r;
-
/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
if (!amdgpu_sriov_vf(adev)) {
@@ -4789,12 +4782,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
r = amdgpu_device_ip_soft_reset(adev);
amdgpu_device_ip_post_soft_reset(adev);
if (r || amdgpu_device_ip_check_soft_reset(adev)) {
- dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+ if (ras->reset_by_gfx_poison) {
+ reset_context->method = AMD_RESET_METHOD_MODE2;
+ dev_info(adev->dev, "soft reset failed, will fallback to mode2 reset!\n");
+ } else {
+ dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
+ }
need_full_reset = true;
}
}
- if (need_full_reset)
+ /* IP suspend will affect mode2 reset, so ip suspend is skipped
+ * when mode2 reset is enabled.
+ */
+ if (need_full_reset &&
+ (reset_context->method != AMD_RESET_METHOD_MODE2))
r = amdgpu_device_ip_suspend(adev);
if (need_full_reset)
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
@@ -4803,6 +4807,11 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
&reset_context->flags);
}
+ r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
+ /* If reset handler not implemented, continue; otherwise return */
+ if (r == -ENOSYS)
+ r = 0;
+
return r;
}
@@ -4892,7 +4901,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
/* If reset handler not implemented, continue; otherwise return */
if (r == -ENOSYS)
r = 0;
- else
+ else if (!r) /* Mode2 reset successful, return */
return r;
/* Reset handler not implemented, use the default method */
@@ -4904,6 +4913,17 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
+ /* If mode2 reset is enabled, ip suspend is skipped in previous
+ * amdgpu_device_pre_asic_reset function. but for mode1 reset,
+ * ip suspend must be called.
+ */
+ if (need_full_reset &&
+ (reset_context->method == AMD_RESET_METHOD_MODE2)) {
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+ amdgpu_device_ip_suspend(tmp_adev);
+ }
+ }
+
/*
* ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper links negotiation in FW (within 1 sec)
--
2.34.1
More information about the amd-gfx
mailing list