[RFC PATCH 07/13] drm/amdgpu: enhance amdgpu_ras_pre_fini() to better support SR
Jiang Liu
gerry at linux.alibaba.com
Wed Jan 8 13:59:59 UTC 2025
Enhance amdgpu_ras_pre_fini() to better support suspend/resume by:
1) fixing a possible resource leak. amdgpu_release_ras_context() only
calls kfree(con) and does not release the resources associated with the
con object, so the error path of amdgpu_device_init() now calls
amdgpu_ras_pre_fini() and amdgpu_ras_fini() instead.
2) calling amdgpu_ras_pre_fini() in amdgpu_device_suspend() to undo what
amdgpu_ras_late_init() has done, because amdgpu_device_resume() invokes
amdgpu_ras_late_init() again on resume.
3) moving the call to amdgpu_ras_recovery_fini() from
amdgpu_ras_pre_fini() to amdgpu_ras_fini().
4) moving the calls to `obj->ras_fini()` from amdgpu_ras_fini() to
amdgpu_ras_pre_fini().
Signed-off-by: Jiang Liu <gerry at linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 45 ++++++++++++++--------
2 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 602322bd7cb8..7ee8f9d73c3c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4599,6 +4599,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return 0;
release_ras_con:
+ amdgpu_ras_pre_fini(adev);
+ amdgpu_ras_fini(adev);
if (amdgpu_sriov_vf(adev))
amdgpu_virt_release_full_gpu(adev, true);
@@ -4613,8 +4615,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->virt.ops = NULL;
r = -EAGAIN;
}
- amdgpu_release_ras_context(adev);
-
failed:
amdgpu_vf_error_trans_all(adev);
@@ -4875,6 +4875,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
cancel_delayed_work_sync(&adev->delayed_init_work);
+ /* RAS features must be disabled before hw fini */
+ amdgpu_ras_pre_fini(adev);
amdgpu_ras_suspend(adev);
amdgpu_device_ip_suspend_phase1(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 040969d56541..bc90035ee25b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4064,42 +4064,50 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_block_list *node, *tmp;
+ struct amdgpu_ras_block_object *obj;
- if (!adev->ras_enabled || !con)
- return 0;
+ if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev))
+ goto disable;
+ list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+ obj = node->ras_obj;
+ if (!obj) {
+ continue;
+ }
+
+ if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
+ continue;
+
+ if (obj->ras_fini) {
+ obj->ras_fini(adev, &obj->ras_comm);
+ } else
+ amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
+ }
+disable:
/* Need disable ras on all IPs here before ip [hw/sw]fini */
- if (AMDGPU_RAS_GET_FEATURES(con->features))
+ if (con && AMDGPU_RAS_GET_FEATURES(con->features))
amdgpu_ras_disable_all_features(adev, 0);
- amdgpu_ras_recovery_fini(adev);
+
return 0;
}
int amdgpu_ras_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras_block_list *ras_node, *tmp;
- struct amdgpu_ras_block_object *obj = NULL;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
if (!adev->ras_enabled || !con)
- return 0;
+ goto out_free_context;
list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
- if (ras_node->ras_obj) {
- obj = ras_node->ras_obj;
- if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
- obj->ras_fini)
- obj->ras_fini(adev, &obj->ras_comm);
- else
- amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
- }
-
/* Clear ras blocks from ras_list and free ras block list node */
list_del(&ras_node->node);
kfree(ras_node);
}
+ amdgpu_ras_recovery_fini(adev);
amdgpu_ras_fs_fini(adev);
amdgpu_ras_interrupt_remove_all(adev);
@@ -4117,8 +4125,11 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
cancel_delayed_work_sync(&con->ras_counte_delay_work);
- amdgpu_ras_set_context(adev, NULL);
- kfree(con);
+out_free_context:
+ if (con) {
+ amdgpu_ras_set_context(adev, NULL);
+ kfree(con);
+ }
return 0;
}
--
2.43.5
More information about the amd-gfx
mailing list