[PATCH] drm/amdgpu: format old RAS eeprom data into V3 version
Yang, Stanley
Stanley.Yang at amd.com
Fri Mar 7 07:36:13 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
> Sent: Friday, March 7, 2025 2:47 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH] drm/amdgpu: format old RAS eeprom data into V3 version
>
> Clear old data and save it in V3 format.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 ++++
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 26 ++++++++++---------
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 1 +
> 3 files changed, 20 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 837f33698b38..266f24002e07 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3465,6 +3465,11 @@ int amdgpu_ras_init_badpage_info(struct
> amdgpu_device *adev)
> adev, control->bad_channel_bitmap);
> con->update_channel_flag = false;
> }
> +
> + if (control->tbl_hdr.version < RAS_TABLE_VER_V3)
[Stanley]: We should check ip_version here; as written, this affects all ASICs whose EEPROM table version is lower than V3.
Regards
Stanley
> + if (!amdgpu_ras_eeprom_reset_table(control))
> + if (amdgpu_ras_save_bad_pages(adev, NULL))
> + dev_warn(adev->dev, "Failed to save
> EEPROM data in V3 format!\n");
> }
>
> return ret;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 09a6f8bc1a5a..71dddb8983ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -413,9 +413,11 @@ static void amdgpu_ras_set_eeprom_table_version(struct
> amdgpu_ras_eeprom_control
>
> switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
> case IP_VERSION(8, 10, 0):
> - case IP_VERSION(12, 0, 0):
> hdr->version = RAS_TABLE_VER_V2_1;
> return;
> + case IP_VERSION(12, 0, 0):
> + hdr->version = RAS_TABLE_VER_V3;
> + return;
> default:
> hdr->version = RAS_TABLE_VER_V1;
> return;
> @@ -443,7 +445,7 @@ int amdgpu_ras_eeprom_reset_table(struct
> amdgpu_ras_eeprom_control *control)
> hdr->header = RAS_TABLE_HDR_VAL;
> amdgpu_ras_set_eeprom_table_version(control);
>
> - if (hdr->version == RAS_TABLE_VER_V2_1) {
> + if (hdr->version >= RAS_TABLE_VER_V2_1) {
> hdr->first_rec_offset = RAS_RECORD_START_V2_1;
> hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
> RAS_TABLE_V2_1_INFO_SIZE;
> @@ -461,7 +463,7 @@ int amdgpu_ras_eeprom_reset_table(struct
> amdgpu_ras_eeprom_control *control)
> }
>
> csum = __calc_hdr_byte_sum(control);
> - if (hdr->version == RAS_TABLE_VER_V2_1)
> + if (hdr->version >= RAS_TABLE_VER_V2_1)
> csum += __calc_ras_info_byte_sum(control);
> csum = -csum;
> hdr->checksum = csum;
> @@ -752,7 +754,7 @@ amdgpu_ras_eeprom_update_header(struct
> amdgpu_ras_eeprom_control *control)
> "Saved bad pages %d reaches threshold value %d\n",
> control->ras_num_bad_pages, ras-
> >bad_page_cnt_threshold);
> control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
> - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
> + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
> control->tbl_rai.rma_status =
> GPU_RETIRED__ECC_REACH_THRESHOLD;
> control->tbl_rai.health_percent = 0;
> }
> @@ -765,7 +767,7 @@ amdgpu_ras_eeprom_update_header(struct
> amdgpu_ras_eeprom_control *control)
> amdgpu_dpm_send_rma_reason(adev);
> }
>
> - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
> + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
> control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
> RAS_TABLE_V2_1_INFO_SIZE +
> control->ras_num_recs *
> RAS_TABLE_RECORD_SIZE; @@ -805,7 +807,7 @@
> amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
> * now calculate gpu health percent
> */
> if (amdgpu_bad_page_threshold != 0 &&
> - control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
> + control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
> control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
> control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
> control->ras_num_bad_pages) * 100)
> / @@ -818,7 +820,7 @@ amdgpu_ras_eeprom_update_header(struct
> amdgpu_ras_eeprom_control *control)
> csum += *pp;
>
> csum += __calc_hdr_byte_sum(control);
> - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
> + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
> csum += __calc_ras_info_byte_sum(control);
> /* avoid sign extension when assigning to "checksum" */
> csum = -csum;
> @@ -1035,7 +1037,7 @@ uint32_t amdgpu_ras_eeprom_max_record_count(struct
> amdgpu_ras_eeprom_control *co
> /* get available eeprom table version first before eeprom table init */
> amdgpu_ras_set_eeprom_table_version(control);
>
> - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
> + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
> return RAS_MAX_RECORD_COUNT_V2_1;
> else
> return RAS_MAX_RECORD_COUNT;
> @@ -1280,7 +1282,7 @@ static int __verify_ras_table_checksum(struct
> amdgpu_ras_eeprom_control *control
> int buf_size, res;
> u8 csum, *buf, *pp;
>
> - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
> + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
> buf_size = RAS_TABLE_HEADER_SIZE +
> RAS_TABLE_V2_1_INFO_SIZE +
> control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
> @@ -1383,7 +1385,7 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control)
>
> __decode_table_header_from_buf(hdr, buf);
>
> - if (hdr->version == RAS_TABLE_VER_V2_1) {
> + if (hdr->version >= RAS_TABLE_VER_V2_1) {
> control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
> control->ras_record_offset = RAS_RECORD_START_V2_1;
> control->ras_max_record_count =
> RAS_MAX_RECORD_COUNT_V2_1; @@ -1423,7 +1425,7 @@ int
> amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
> DRM_DEBUG_DRIVER("Found existing EEPROM table with %d
> records",
> control->ras_num_bad_pages);
>
> - if (hdr->version == RAS_TABLE_VER_V2_1) {
> + if (hdr->version >= RAS_TABLE_VER_V2_1) {
> res = __read_table_ras_info(control);
> if (res)
> return res;
> @@ -1443,7 +1445,7 @@ int amdgpu_ras_eeprom_check(struct
> amdgpu_ras_eeprom_control *control)
> ras->bad_page_cnt_threshold);
> } else if (hdr->header == RAS_TABLE_HDR_BAD &&
> amdgpu_bad_page_threshold != 0) {
> - if (hdr->version == RAS_TABLE_VER_V2_1) {
> + if (hdr->version >= RAS_TABLE_VER_V2_1) {
> res = __read_table_ras_info(control);
> if (res)
> return res;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 13f7eda9a696..ec6d7ea37ad0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -28,6 +28,7 @@
>
> #define RAS_TABLE_VER_V1 0x00010000
> #define RAS_TABLE_VER_V2_1 0x00021000
> +#define RAS_TABLE_VER_V3 0x00030000
>
> struct amdgpu_device;
>
> --
> 2.34.1
More information about the amd-gfx
mailing list