[PATCH v2 05/10] drm/amdgpu: Add helper to initialize badpage info
Lijo Lazar
lijo.lazar at amd.com
Wed Sep 11 06:58:53 UTC 2024
Add a separate function to read badpage data during initialization.
Reading bad pages will need hardware access and cannot be done during
reset. Hence in cases where device needs a full reset during
init itself, attempting to read will cause a deadlock.
Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
Reviewed-by: Feifei Xu <Feifei.Xu at amd.com>
Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 56 +++++++++++++++-------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +-
3 files changed, 41 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0caab1a4ae8c..4928881c13b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2953,7 +2953,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
* Note: theoretically, this should be called before all vram allocations
* to protect retired page from abusing
*/
- r = amdgpu_ras_recovery_init(adev);
+ r = amdgpu_ras_recovery_init(adev, true);
if (r)
goto init_failed;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c7cdbd2b5adc..f5cd91fd63ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3146,7 +3146,42 @@ static int amdgpu_ras_page_retirement_thread(void *param)
return 0;
}
-int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ int ret;
+
+ if (!con || amdgpu_sriov_vf(adev))
+ return 0;
+
+ ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
+
+ if (ret)
+ return ret;
+
+ /* HW not usable */
+ if (amdgpu_ras_is_rma(adev))
+ return -EHWPOISON;
+
+ if (con->eeprom_control.ras_num_recs) {
+ ret = amdgpu_ras_load_bad_pages(adev);
+ if (ret)
+ return ret;
+
+ amdgpu_dpm_send_hbm_bad_pages_num(
+ adev, con->eeprom_control.ras_num_recs);
+
+ if (con->update_channel_flag == true) {
+ amdgpu_dpm_send_hbm_bad_channel_flag(
+ adev, con->eeprom_control.bad_channel_bitmap);
+ con->update_channel_flag = false;
+ }
+ }
+
+ return ret;
+}
+
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
@@ -3187,25 +3222,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
*/
if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
return 0;
- ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
- /*
- * This calling fails when is_rma is true or
- * ret != 0.
- */
- if (amdgpu_ras_is_rma(adev) || ret)
- goto free;
-
- if (con->eeprom_control.ras_num_recs) {
- ret = amdgpu_ras_load_bad_pages(adev);
+ if (init_bp_info) {
+ ret = amdgpu_ras_init_badpage_info(adev);
if (ret)
goto free;
-
- amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
-
- if (con->update_channel_flag == true) {
- amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
- con->update_channel_flag = false;
- }
}
mutex_init(&con->page_rsv_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 669720a9c60a..871b2d6278e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -736,8 +736,8 @@ struct amdgpu_ras_block_hw_ops {
* 8: feature disable
*/
-
-int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
+int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev);
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info);
void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev);
--
2.25.1
More information about the amd-gfx
mailing list