[PATCH 3/4] drm/amdgpu: add RAS poison handling for MCA
Tao Zhou
tao.zhou1 at amd.com
Wed Oct 19 08:11:49 UTC 2022
For MCA poison, if unmap queue fails, only gpu reset should be
triggered without page retirement handling, MCA notifier will do it.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 27 +++++++++++++++----------
1 file changed, 16 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9494fa14db9a..dd1b1a612343 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -184,18 +184,23 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
bool reset)
{
int ret;
- struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
- struct ras_common_if head = {
- .block = AMDGPU_RAS_BLOCK__UMC,
- };
- struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
- ret =
- amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
-
- if (ret == AMDGPU_RAS_SUCCESS && obj) {
- obj->err_data.ue_count += err_data->ue_count;
- obj->err_data.ce_count += err_data->ce_count;
+ if (adev->gmc.xgmi.connected_to_cpu) {
+ ret = amdgpu_umc_poison_handler_mca(adev, ras_error_status, reset);
+ } else {
+ struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+ struct ras_common_if head = {
+ .block = AMDGPU_RAS_BLOCK__UMC,
+ };
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+ ret =
+ amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+
+ if (ret == AMDGPU_RAS_SUCCESS && obj) {
+ obj->err_data.ue_count += err_data->ue_count;
+ obj->err_data.ce_count += err_data->ce_count;
+ }
}
return ret;
--
2.35.1
More information about the amd-gfx
mailing list