[RFC PATCH 12/13] drm/amdgpu/nbio: improve the way to manage irq reference count
Jiang Liu
gerry at linux.alibaba.com
Wed Jan 8 14:00:04 UTC 2025
Refactor the nbio-related code to improve how the irq reference count is
managed. Originally, amdgpu_irq_get() is called from ip_blocks[].late_init
and amdgpu_irq_put() is called from ip_blocks[].hw_fini. This asymmetric
design may cause issues under certain conditions. So:
1) introduce amdgpu_nbio_ras_early_fini() to undo the work done by
amdgpu_nbio_ras_late_init().
2) remove the calls to amdgpu_irq_put() from xxxx_hw_fini().
3) record whether a reference count is held for each specific irq.
Signed-off-by: Jiang Liu <gerry at linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 16 +++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 1 +
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 1 +
drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 1 +
drivers/gpu/drm/amd/amdgpu/soc15.c | 16 ----------------
5 files changed, 18 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index c75ce91f94ab..b8a69ceec2e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -64,13 +64,27 @@ int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *
r = amdgpu_irq_get(adev, &adev->nbio.ras_controller_irq, 0);
if (r)
goto late_fini;
+ amdgpu_ras_set_marker(adev, ras_block, AMDGPU_MARKER_IRQ0);
r = amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0);
if (r)
goto late_fini;
+ amdgpu_ras_set_marker(adev, ras_block, AMDGPU_MARKER_IRQ1);
}
return 0;
late_fini:
- amdgpu_ras_block_early_fini(adev, ras_block);
+ amdgpu_nbio_ras_early_fini(adev, ras_block);
return r;
}
+
+void amdgpu_nbio_ras_early_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block)
+{
+ if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
+ if (amdgpu_ras_test_and_clear_marker(adev, ras_block, AMDGPU_MARKER_IRQ0))
+ amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
+ if (amdgpu_ras_test_and_clear_marker(adev, ras_block, AMDGPU_MARKER_IRQ1))
+ amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+ }
+
+ amdgpu_ras_block_early_fini(adev, ras_block);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 79c2f807b9fe..e1edf75602c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -117,6 +117,7 @@ struct amdgpu_nbio {
int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
+void amdgpu_nbio_ras_early_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block);
u64 amdgpu_nbio_get_pcie_replay_count(struct amdgpu_device *adev);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 97782a73f4b0..6c727b77bb3c 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -665,6 +665,7 @@ struct amdgpu_nbio_ras nbio_v7_4_ras = {
},
.hw_ops = &nbio_v7_4_ras_hw_ops,
.ras_late_init = amdgpu_nbio_ras_late_init,
+ .ras_early_fini = amdgpu_nbio_ras_early_fini,
},
.handle_ras_controller_intr_no_bifring = nbio_v7_4_handle_ras_controller_intr_no_bifring,
.handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring,
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index 8a0a63ac88d2..684a38a16247 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -703,6 +703,7 @@ struct amdgpu_nbio_ras nbio_v7_9_ras = {
},
.hw_ops = &nbio_v7_9_ras_hw_ops,
.ras_late_init = amdgpu_nbio_ras_late_init,
+ .ras_early_fini = amdgpu_nbio_ras_early_fini,
},
.handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring,
.handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 6fcdeb265a22..1dca7d7c813c 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1299,22 +1299,6 @@ static int soc15_common_hw_fini(struct amdgpu_ip_block *ip_block)
if (amdgpu_sriov_vf(adev))
xgpu_ai_mailbox_put_irq(adev);
- /*
- * For minimal init, late_init is not called, hence RAS irqs are not
- * enabled.
- */
- if ((!amdgpu_sriov_vf(adev)) &&
- (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) &&
- adev->nbio.ras_if &&
- amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
- if (adev->nbio.ras &&
- adev->nbio.ras->init_ras_controller_interrupt)
- amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
- if (adev->nbio.ras &&
- adev->nbio.ras->init_ras_err_event_athub_interrupt)
- amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
- }
-
return 0;
}
--
2.43.5
More information about the amd-gfx
mailing list