[PATCH 4/5] drma/amdgpu: set fatal flag for RAS recovery
Zhang, Hawking
Hawking.Zhang at amd.com
Wed Jun 5 06:51:42 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
Please correct the typo in the commit subject before pushing the change: "drma" -> "drm".
Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Friday, May 31, 2024 18:49
To: amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 4/5] drma/amdgpu: set fatal flag for RAS recovery
PMFW needs the flag to know the reason for the mode1 reset.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 10 +++++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  |  6 +++---
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c |  2 +-
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   |  2 +-
7 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index fb5fc1fe6ad0..f55bff59052f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -940,7 +940,7 @@ int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
if (adev->gfx.ras && adev->gfx.ras->ras_block.hw_ops &&
adev->gfx.ras->ras_block.hw_ops->query_ras_error_count)
adev->gfx.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, true);
}
return AMDGPU_RAS_SUCCESS;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ff2d34dc9718..2071e30d7e56 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2070,7 +2070,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
if (poison_stat && !con->is_rma) {
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
block_obj->ras_comm.name);
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, false);
}
if (!poison_stat)
@@ -2825,7 +2825,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
amdgpu_ras_error_data_fini(&err_data);
if (err_cnt && con->is_rma)
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, false);
mutex_lock(&con->umc_ecc_log.lock);
if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
@@ -2888,7 +2888,7 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
flush_delayed_work(&con->page_retirement_dwork);
con->gpu_reset_flags |= reset;
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, false);
}
return 0;
@@ -3815,7 +3815,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
amdgpu_ras_set_fed(adev, true);
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, true);
}
}
@@ -3996,7 +3996,7 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev,
return ret;
}
-int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal)
{
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 37e1c93c243d..ed5793458a70 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -878,7 +878,7 @@ bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
int amdgpu_ras_is_supported(struct amdgpu_device *adev, unsigned int block);
-int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal);
struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 151f83ea803b..f976b6deb42d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -129,7 +129,7 @@ int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev,
if (amdgpu_sriov_vf(adev))
return AMDGPU_RAS_SUCCESS;
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, true);
return AMDGPU_RAS_SUCCESS;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 4a72ff8d8d80..2596a1c2a64e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -198,7 +198,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
if ((err_data->ue_count || err_data->de_count) &&
(reset || (con && con->is_rma))) {
con->gpu_reset_flags |= reset;
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, false);
}
return AMDGPU_RAS_SUCCESS;
@@ -247,7 +247,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
if (reset || (err_data.err_addr_cnt && con && con->is_rma)) {
con->gpu_reset_flags |= reset;
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, false);
}
return 0;
@@ -266,7 +266,7 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
* let MCA notifier do page retirement.
*/
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, false);
}
return ret;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 9cd221ed240c..07c24704c4b8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -98,7 +98,7 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
}
if (con && !con->is_rma)
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, false);
}
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index b8fc9e126e0d..0935ed57a906 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -414,7 +414,7 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
/* ras_controller_int is dedicated for nbif ras error,
* not the global interrupt for sync flood
*/
- amdgpu_ras_reset_gpu(adev);
+ amdgpu_ras_reset_gpu(adev, true);
}
amdgpu_ras_error_data_fini(&err_data);
--
2.34.1
More information about the amd-gfx mailing list