[PATCH] drm/amdgpu: Update EEPROM RAS table for mismatched table version
Zhou1, Tao
Tao.Zhou1 at amd.com
Fri Mar 29 07:32:16 UTC 2024
[AMD Official Use Only - General]
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Candice Li
> Sent: Wednesday, March 27, 2024 2:16 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Li, Candice <Candice.Li at amd.com>
> Subject: [PATCH] drm/amdgpu: Update EEPROM RAS table for mismatched table
> version
>
> Update table version and restore bad page records to EEPROM RAS table for
> mismatched table version case. Otherwise force to reset the table.
>
> Signed-off-by: Candice Li <candice.li at amd.com>
> ---
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 88 ++++++++++++++++---
> 1 file changed, 78 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 06a62a8a992e9b..42d0ef2f512474 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -1319,6 +1319,37 @@ static int __read_table_ras_info(struct
> amdgpu_ras_eeprom_control *control)
> return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; }
>
> +static bool amdgpu_ras_eeprom_table_version_validate(struct
> +amdgpu_ras_eeprom_control *control) {
> + struct amdgpu_device *adev = to_amdgpu_device(control);
> + struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
> +
> + switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
> + case IP_VERSION(8, 10, 0):
> + case IP_VERSION(12, 0, 0):
> + return hdr->version == RAS_TABLE_VER_V2_1;
> + default:
> + return hdr->version == RAS_TABLE_VER_V1;
> + }
> +}
> +
> +static void amdgpu_ras_update_eeprom_control(struct
> +amdgpu_ras_eeprom_table_header *hdr) {
> + struct amdgpu_ras_eeprom_control *control =
> + container_of(hdr, struct amdgpu_ras_eeprom_control, tbl_hdr);
> +
> + if (hdr->version == RAS_TABLE_VER_V2_1) {
> + control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
> + control->ras_record_offset = RAS_RECORD_START_V2_1;
> + control->ras_max_record_count =
> RAS_MAX_RECORD_COUNT_V2_1;
> + } else {
> + control->ras_num_recs = RAS_NUM_RECS(hdr);
> + control->ras_record_offset = RAS_RECORD_START;
> + control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
> + }
> + control->ras_fri = RAS_OFFSET_TO_INDEX(control,
> +hdr->first_rec_offset); }
> +
> int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> bool *exceed_err_limit)
> {
> @@ -1326,7 +1357,9 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control,
> unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
> struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
> struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> - int res;
> + int res, res1;
> + struct eeprom_table_record *bps;
> + u32 num_recs;
>
> *exceed_err_limit = false;
>
> @@ -1355,16 +1388,51 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control,
>
> __decode_table_header_from_buf(hdr, buf);
>
> - if (hdr->version == RAS_TABLE_VER_V2_1) {
> - control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
> - control->ras_record_offset = RAS_RECORD_START_V2_1;
> - control->ras_max_record_count =
> RAS_MAX_RECORD_COUNT_V2_1;
> - } else {
> - control->ras_num_recs = RAS_NUM_RECS(hdr);
> - control->ras_record_offset = RAS_RECORD_START;
> - control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
> + amdgpu_ras_update_eeprom_control(hdr);
> +
> + if (!amdgpu_ras_eeprom_table_version_validate(control)) {
> + num_recs = control->ras_num_recs;
> + if (num_recs && amdgpu_bad_page_threshold) {
> + /* Save bad page records existed in EEPROM */
> + bps = kcalloc(num_recs, sizeof(*bps), GFP_KERNEL);
> + if (!bps)
> + return -ENOMEM;
> +
> + res1 = amdgpu_ras_eeprom_read(control, bps,
> num_recs);
> + if (res1)
> + dev_warn(adev->dev, "Fail to load EEPROM
> table, force to reset
> +it.");
> +
> + res = amdgpu_ras_eeprom_reset_table(control);
> + if (res) {
> + dev_err(adev->dev, "Failed to create a new
> EEPROM table.");
> + kfree(bps);
> + return res < 0 ? res : 0;
> + }
> +
> + if (!res1) {
> + /* Update the EEPROM table with correct table
> version and
> + * original bad page records
> + */
> + amdgpu_ras_update_eeprom_control(hdr);
> + res = amdgpu_ras_eeprom_append(control, bps,
> num_recs);
> +
> + if (res) {
> + dev_warn(adev->dev, "Fail to update
> EEPROM table, force to reset it.");
> + res =
> amdgpu_ras_eeprom_reset_table(control);
[Tao] I think the reset here can be dropped. Apart from this, the patch is:
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
It would be better to also get Stanley's Reviewed-by.
> + }
> + }
> +
> + kfree(bps);
> + } else
> + res = amdgpu_ras_eeprom_reset_table(control);
> +
> + if (res) {
> + dev_err(adev->dev, "Failed to reset EEPROM table.");
> + return res < 0 ? res : 0;
> + }
> +
> + amdgpu_ras_update_eeprom_control(hdr);
> }
> - control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
>
> if (hdr->header == RAS_TABLE_HDR_VAL) {
> DRM_DEBUG_DRIVER("Found existing EEPROM table with %d
> records",
> --
> 2.25.1
More information about the amd-gfx
mailing list