[PATCH Review 6/6] drm/amdgpu: Set EEPROM ras info
Zhang, Hawking
Hawking.Zhang at amd.com
Wed Jun 7 06:52:49 UTC 2023
[AMD Official Use Only - General]
Series is
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Stanley.Yang
Sent: Wednesday, June 7, 2023 14:48
To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Li, Candice <Candice.Li at amd.com>
Cc: Yang, Stanley <Stanley.Yang at amd.com>
Subject: [PATCH Review 6/6] drm/amdgpu: Set EEPROM ras info
Set EEPROM ras info: rma status, health percent and bad page threshold.
Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 24 +++++++++++++++++++
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 5 ++++
2 files changed, 29 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 9eceb3bc1058..c2e8f6491ac6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -406,6 +406,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) {
struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+ struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
u8 csum;
int res;
@@ -423,6 +424,14 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE;
+ rai->rma_status = GPU_HEALTH_USABLE;
+ /**
+ * GPU health represented as a percentage.
+ * 0 means worst health, 100 means fully health.
+ */
+ rai->health_percent = 100;
+ /* ecc_page_threshold = 0 means disable bad page retirement */
+ rai->ecc_page_threshold = con->bad_page_cnt_threshold;
} else {
hdr->first_rec_offset = RAS_RECORD_START;
hdr->tbl_size = RAS_TABLE_HEADER_SIZE; @@ -712,6 +721,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
"Saved bad pages %d reaches threshold value %d\n",
control->ras_num_recs, ras->bad_page_cnt_threshold);
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
+ if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
+ control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
+ control->tbl_rai.health_percent = 0;
+ }
}
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) @@ -749,6 +762,17 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
goto Out;
}
+ /**
+ * bad page records have been stored in eeprom,
+ * now calculate gpu health percent
+ */
+ if (amdgpu_bad_page_threshold != 0 &&
+ control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
+ control->ras_num_recs < ras->bad_page_cnt_threshold)
+ control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
+ control->ras_num_recs) * 100) /
+ ras->bad_page_cnt_threshold;
+
/* Recalc the checksum.
*/
csum = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 3c5575c19bf8..6dfd667f3013 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -31,6 +31,11 @@
struct amdgpu_device;
+enum amdgpu_ras_gpu_health_status {
+ GPU_HEALTH_USABLE = 0,
+ GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
+};
+
enum amdgpu_ras_eeprom_err_type {
AMDGPU_RAS_EEPROM_ERR_NA,
AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,
--
2.17.1
More information about the amd-gfx
mailing list