[RFC PATCH 11/13] drm/amdgpu/sdma: improve the way to manage irq reference count
Jiang Liu
gerry at linux.alibaba.com
Wed Jan 8 14:00:03 UTC 2025
Refactor the sdma related code to improve how the irq reference count
is managed. Originally amdgpu_irq_get() is called from
ip_blocks[].late_init and amdgpu_irq_put() is called from
ip_blocks[].hw_fini. This asymmetric design may cause issues under
certain conditions. So:
1) introduce amdgpu_sdma_ras_early_fini() to undo work done by
amdgpu_sdma_ras_late_init().
2) remove the call to amdgpu_irq_put() in xxxx_hw_fini().
3) call amdgpu_irq_get() in sdma_v4_4_2_xcp_resume() to keep the irq
reference count balanced. Currently sdma_v4_4_2_xcp_resume() doesn't
invoke ip_blocks[].late_init (which is where amdgpu_irq_get() is
called), but sdma_v4_4_2_xcp_suspend() invokes amdgpu_irq_put(), which
leaves the irq reference count unbalanced. Fix it by calling
amdgpu_irq_get() in sdma_v4_4_2_xcp_resume().
Signed-off-by: Jiang Liu <gerry at linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 26 ++++++++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 2 ++
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 8 --------
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 23 ++++++++++++---------
5 files changed, 40 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index fa19c5391d8c..ff5907f2c544 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -383,7 +383,7 @@ enum amdgpu_marker {
AMDGPU_MARKER_RAS_DEBUGFS = 63,
};
-#define AMDGPU_MARKER_INDEX_IRQ(idx) (AMDGPU_MARKER_INDEX_IRQ0 + (idx))
+#define AMDGPU_MARKER_IRQ(idx) (AMDGPU_MARKER_IRQ0 + (idx))
struct amdgpu_ip_block_status {
bool valid;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 21938e858d55..799bcd9978da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -110,16 +110,35 @@ int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev,
AMDGPU_SDMA_IRQ_INSTANCE0 + i);
if (r)
goto late_fini;
+ amdgpu_ras_set_marker(adev, ras_block, AMDGPU_MARKER_IRQ(i));
}
}
return 0;
late_fini:
- amdgpu_ras_block_early_fini(adev, ras_block);
+ amdgpu_sdma_ras_early_fini(adev, ras_block);
return r;
}
+void amdgpu_sdma_ras_early_fini(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block)
+{
+ int i;
+
+ if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+ for (i = 0; i < adev->sdma.num_instances; i++) {
+ if (amdgpu_ras_test_and_clear_marker(adev, ras_block,
+ AMDGPU_MARKER_IRQ(i))) {
+ amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
+ AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+ }
+ }
+ }
+
+ amdgpu_ras_block_early_fini(adev, ras_block);
+}
+
int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev,
void *err_data,
struct amdgpu_iv_entry *entry)
@@ -334,8 +353,11 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
adev->sdma.ras_if = &ras->ras_block.ras_comm;
/* If not define special ras_late_init function, use default ras_late_init */
- if (!ras->ras_block.ras_late_init)
+ if (!ras->ras_block.ras_late_init) {
+ WARN_ON(ras->ras_block.ras_early_fini);
ras->ras_block.ras_late_init = amdgpu_sdma_ras_late_init;
+ ras->ras_block.ras_early_fini = amdgpu_sdma_ras_early_fini;
+ }
/* If not defined special ras_cb function, use default ras_cb */
if (!ras->ras_block.ras_cb)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 087ce0f6fa07..1915e6c9be63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -164,6 +164,8 @@ int amdgpu_sdma_get_index_from_ring(struct amdgpu_ring *ring, uint32_t *index);
uint64_t amdgpu_sdma_get_csa_mc_addr(struct amdgpu_ring *ring, unsigned vmid);
int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block);
+void amdgpu_sdma_ras_early_fini(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block);
int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev,
void *err_data,
struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index ccf0d531776d..369d7094a3ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1968,18 +1968,10 @@ static int sdma_v4_0_hw_init(struct amdgpu_ip_block *ip_block)
static int sdma_v4_0_hw_fini(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
- int i;
if (amdgpu_sriov_vf(adev))
return 0;
- if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
- for (i = 0; i < adev->sdma.num_instances; i++) {
- amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
- AMDGPU_SDMA_IRQ_INSTANCE0 + i);
- }
- }
-
sdma_v4_0_ctx_switch_enable(adev, false);
sdma_v4_0_enable(adev, false);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 9c7cea0890c9..744569bbc1e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1486,19 +1486,11 @@ static int sdma_v4_4_2_hw_fini(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
uint32_t inst_mask;
- int i;
if (amdgpu_sriov_vf(adev))
return 0;
inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
- if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
- for (i = 0; i < adev->sdma.num_instances; i++) {
- amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
- AMDGPU_SDMA_IRQ_INSTANCE0 + i);
- }
- }
-
sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask);
sdma_v4_4_2_inst_enable(adev, false, inst_mask);
@@ -2153,14 +2145,24 @@ const struct amdgpu_ip_block_version sdma_v4_4_2_ip_block = {
static int sdma_v4_4_2_xcp_resume(void *handle, uint32_t inst_mask)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
- int r;
+ uint32_t tmp_mask = inst_mask;
+ int r, i;
if (!amdgpu_sriov_vf(adev))
sdma_v4_4_2_inst_init_golden_registers(adev, inst_mask);
r = sdma_v4_4_2_inst_start(adev, inst_mask);
+ if (r)
+ return r;
- return r;
+ if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+ for_each_inst(i, tmp_mask) {
+ amdgpu_irq_get(adev, &adev->sdma.ecc_irq,
+ AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+ }
+ }
+
+ return 0;
}
static int sdma_v4_4_2_xcp_suspend(void *handle, uint32_t inst_mask)
@@ -2366,6 +2368,7 @@ static struct amdgpu_sdma_ras sdma_v4_4_2_ras = {
.ras_block = {
.hw_ops = &sdma_v4_4_2_ras_hw_ops,
.ras_late_init = sdma_v4_4_2_ras_late_init,
+ .ras_early_fini = amdgpu_sdma_ras_early_fini,
},
};
--
2.43.5
More information about the amd-gfx
mailing list