[PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired
Zhang, Hawking
Hawking.Zhang at amd.com
Thu Dec 21 08:35:19 UTC 2023
[AMD Official Use Only - General]
Feel free to drop the check as below since amdgpu_xxx_ras_late_init applies the check for interrupt enablement.
+ amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__SDMA)
+ amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__GFX)
Apart from that, the change is
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Stanley.Yang
Sent: Thursday, December 21, 2023 14:05
To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>
Cc: Yang, Stanley <Stanley.Yang at amd.com>
Subject: [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired
The ecc_irq is disabled while GPU mode2 reset suspending process, but not be enabled during GPU mode2 reset resume process.
Changed from V1:
only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip
delete amdgpu_ras_late_resume function
Changed from V2:
check umc ras supported before put ecc_irq
Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/aldebaran.c | 28 +++++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 ++++
4 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 02f4c6f9d4f6..b60a3c1bd0f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, {
struct list_head *reset_device_list = reset_context->reset_device_list;
struct amdgpu_device *tmp_adev = NULL;
+ struct amdgpu_ras *con;
int r;
if (reset_device_list == NULL)
@@ -355,7 +356,32 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
*/
amdgpu_register_gpu_instance(tmp_adev);
- /* Resume RAS */
+ /* Resume RAS, ecc_irq */
+ con = amdgpu_ras_get_context(tmp_adev);
+ if (!amdgpu_sriov_vf(tmp_adev) && con) {
+ if (tmp_adev->sdma.ras &&
+ amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__SDMA) &&
+ tmp_adev->sdma.ras->ras_block.ras_late_init) {
+ r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
+ &tmp_adev->sdma.ras->ras_block.ras_comm);
+ if (r) {
+ dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
+ goto end;
+ }
+ }
+
+ if (tmp_adev->gfx.ras &&
+ amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__GFX) &&
+ tmp_adev->gfx.ras->ras_block.ras_late_init) {
+ r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
+ &tmp_adev->gfx.ras->ras_block.ras_comm);
+ if (r) {
+ dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
+ goto end;
+ }
+ }
+ }
+
amdgpu_ras_resume(tmp_adev);
/* Update PSP FW topology after reset */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 09cbca596bb5..4048539205cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -1043,6 +1043,10 @@ static int gmc_v10_0_hw_fini(void *handle)
amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+ if (adev->gmc.ecc_irq.funcs &&
+ amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+ amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 416f3e4f0438..e1ca5a599971 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -941,6 +941,11 @@ static int gmc_v11_0_hw_fini(void *handle)
}
amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+
+ if (adev->gmc.ecc_irq.funcs &&
+ amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+ amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
gmc_v11_0_gart_disable(adev);
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 205db28a9803..f00e5c8c79b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -2388,6 +2388,10 @@ static int gmc_v9_0_hw_fini(void *handle)
amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+ if (adev->gmc.ecc_irq.funcs &&
+ amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+ amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
return 0;
}
--
2.25.1
More information about the amd-gfx
mailing list