[PATCH 4/9] drm/amdgpu: break driver init process when it's bad GPU
Guchun Chen
guchun.chen at amd.com
Thu Jul 23 08:33:41 UTC 2020
When retrieving bad gpu tag from eeprom, GPU init should
fail as the GPU needs to be retired for further check.
Signed-off-by: Guchun Chen <guchun.chen at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 +++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 11 ++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 ++-
4 files changed, 34 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2662cd7c8685..882f8a0964a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
* it should be called after amdgpu_device_ip_hw_init_phase2 since
* for some ASICs the RAS EEPROM code relies on SMU fully functioning
* for I2C communication which only true at this point.
- * recovery_init may fail, but it can free all resources allocated by
- * itself and its failure should not stop amdgpu init process.
+ *
+ * amdgpu_ras_recovery_init may fail, but the upper only cares the
+ * failure from bad gpu situation and stop amdgpu init process
+ * arrordingly. For other failed cases, it will still release all
+ * the resource and print error message, rather than returning one
+ * negative value to upper level.
*
* Note: theoretically, this should be called before all vram allocations
* to protect retired page from abusing
*/
- amdgpu_ras_recovery_init(adev);
+ r = amdgpu_ras_recovery_init(adev);
+ if (r)
+ goto init_failed;
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8daeb54917ed..06db2f0b78d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1819,6 +1819,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
uint32_t max_eeprom_records_len = 0;
+ bool bad_gpu = false;
int ret;
if (con)
@@ -1840,9 +1841,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
- ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
- if (ret)
+ ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &bad_gpu);
+ /* We only fail this calling and halt booting when bad_gpu is true. */
+ if (bad_gpu) {
+ ret = -EINVAL;
goto free;
+ }
if (con->eeprom_control.num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
@@ -1865,6 +1869,13 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
out:
dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
+ /*
+ * Except bad_gpu case, other failure cases in this function
+ * would not fail amdgpu driver init.
+ */
+ if (!bad_gpu)
+ ret = 0;
+
return ret;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 35c0c849d49b..3f1b167afe6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -241,12 +241,14 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
}
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
+ bool *bad_gpu)
{
int ret = 0;
struct amdgpu_device *adev = to_amdgpu_device(control);
unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 };
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
struct i2c_msg msg = {
.addr = 0,
.flags = I2C_M_RD,
@@ -254,6 +256,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
.buf = buff,
};
+ *bad_gpu = false;
+
/* Verify i2c adapter is initialized */
if (!adev->pm.smu_i2c.algo)
return -ENOENT;
@@ -282,6 +286,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
control->num_recs);
+ } else if ((ras->bad_page_cnt_threshold != 0xFFFFFFFF) && (
+ hdr->header == EEPROM_TABLE_HDR_BAD)) {
+ *bad_gpu = true;
+ DRM_ERROR("Detect BAD GPU TAG in eeprom table and "
+ "GPU shall be retired.\n");
} else {
DRM_INFO("Creating new EEPROM table");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index b272840cb069..a2de243da31d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -77,7 +77,8 @@ struct eeprom_table_record {
unsigned char mcumc_id;
}__attribute__((__packed__));
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
+ bool *bad_gpu);
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
--
2.17.1
More information about the amd-gfx
mailing list