[PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place
Zhou1, Tao
Tao.Zhou1 at amd.com
Thu Sep 5 04:04:02 UTC 2019
ras recovery_init should be called after ttm init,
bad page reserve should be put in front of gpu reset since i2c
may be unstable during gpu reset.
add cleanup for recovery_init and recovery_fini
v2: add more comment and print.
remove cancel_work_sync in recovery_init.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
Reviewed-by: Guchun Chen <guchun.chen at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 ---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 39 ++++++++++++++--------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 12 +++++++
4 files changed, 43 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e50861e16cf5..22cd3deab731 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3629,11 +3629,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
break;
}
}
-
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- amdgpu_ras_reserve_bad_pages(tmp_adev);
- }
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e68f43d1cfea..c2b2b0e3515c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1491,16 +1491,17 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
return 0;
}
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data = &con->eh_data;
int ret;
- *data = kmalloc(sizeof(**data),
- GFP_KERNEL|__GFP_ZERO);
- if (!*data)
- return -ENOMEM;
+ *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
+ if (!*data) {
+ ret = -ENOMEM;
+ goto out;
+ }
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
@@ -1509,18 +1510,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
if (ret)
- return ret;
+ goto free;
if (adev->psp.ras.ras->eeprom_control.num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
if (ret)
- return ret;
+ goto free;
ret = amdgpu_ras_reserve_bad_pages(adev);
if (ret)
- return ret;
+ goto release;
}
return 0;
+
+release:
+ amdgpu_ras_release_bad_pages(adev);
+free:
+ con->eh_data = NULL;
+ kfree((*data)->bps);
+ kfree((*data)->bps_bo);
+ kfree(*data);
+out:
+ DRM_WARN("Failed to initilaize ras recovery!\n");
+
+ return ret;
}
static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
@@ -1528,12 +1541,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data = con->eh_data;
+ /* recovery_init failed to init it, fini is useless */
+ if (!data)
+ return 0;
+
cancel_work_sync(&con->recovery_work);
amdgpu_ras_release_bad_pages(adev);
mutex_lock(&con->recovery_lock);
con->eh_data = NULL;
kfree(data->bps);
+ kfree(data->bps_bo);
kfree(data);
mutex_unlock(&con->recovery_lock);
@@ -1625,9 +1643,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
return r;
}
- if (amdgpu_ras_recovery_init(adev))
- goto recovery_out;
-
amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
if (amdgpu_ras_fs_init(adev))
@@ -1642,8 +1657,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
con->hw_supported, con->supported);
return 0;
fs_out:
- amdgpu_ras_recovery_fini(adev);
-recovery_out:
amdgpu_ras_set_context(adev, NULL);
kfree(con);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 96210e18191e..012034d2ae06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
return ras && (ras->supported & (1 << block));
}
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
unsigned int block);
@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
{
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ /* save bad page to eeprom before gpu reset,
+ * i2c may be unstable in gpu reset
+ */
+ amdgpu_ras_reserve_bad_pages(adev);
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
schedule_work(&ras->recovery_work);
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index dcd32d01a579..c05638cf3f3d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -49,6 +49,7 @@
#include "amdgpu_trace.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_sdma.h"
+#include "amdgpu_ras.h"
#include "bif/bif_4_1_d.h"
static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
@@ -1772,6 +1773,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
adev->gmc.visible_vram_size);
#endif
+ /*
+ * retired pages will be loaded from eeprom and reserved here,
+ * it should be called after ttm init since new bo may be created,
+ * recovery_init may fail, but it can free all resources allocated by
+ * itself and its failure should not stop amdgpu init process.
+ *
+ * Note: theoretically, this should be called before all vram allocations
+ * to protect retired page from abusing
+ */
+ amdgpu_ras_recovery_init(adev);
+
/*
*The reserved vram for firmware must be pinned to the specified
*place on the VRAM, so reserve it early.
--
2.17.1
More information about the amd-gfx
mailing list