[PATCH 1/2] drm/amdgpu: refine eeprom data check
Zhou1, Tao
Tao.Zhou1 at amd.com
Thu Jul 10 06:50:06 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: Xie, Patrick <Gangliang.Xie at amd.com>
> Sent: Wednesday, July 9, 2025 4:29 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Xie, Patrick <Gangliang.Xie at amd.com>
> Subject: [PATCH 1/2] drm/amdgpu: refine eeprom data check
>
> Add EEPROM data checksum checks after data writing, after GPU reset, and before
> driver unload. Reset the EEPROM and save the correct data to it when a check fails.
>
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++++-
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 +++++++++++++++++++
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 1 +
> 6 files changed, 48 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 84fcaf84fead..2503f2300472 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6410,6 +6410,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device
> *adev,
> struct amdgpu_hive_info *hive = NULL;
> int r = 0;
> bool need_emergency_restart = false;
> + enum amd_reset_method tmp_reset_method;
>
> if (amdgpu_reset_domain_in_drain_mode(adev->reset_domain))
> return 0;
> @@ -6506,6 +6507,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device
> *adev,
> if (!r)
> drm_dev_wedged_event(adev_to_drm(adev),
> DRM_WEDGE_RECOVERY_NONE);
>
> + if (reset_context->method == AMD_RESET_METHOD_MODE1 ||
> + reset_context->method == AMD_RESET_METHOD_MODE2) {
> + amdgpu_ras_eeprom_check_and_recover(adev);
> + } else {
> + if (reset_context->method == AMD_RESET_METHOD_NONE) {
> + tmp_reset_method = adev->asic_funcs->reset_method(adev);
> + if (tmp_reset_method == AMD_RESET_METHOD_MODE1 ||
> + tmp_reset_method == AMD_RESET_METHOD_MODE2)
> + amdgpu_ras_eeprom_check_and_recover(adev);
> + }
> + }
[Tao] The logic can be simplified by resolving the effective reset method first and then checking it once, like this:
tmp_reset_method = reset_context->method;
if (tmp_reset_method == AMD_RESET_METHOD_NONE)
tmp_reset_method = adev->asic_funcs->reset_method(adev);
if (tmp_reset_method == AMD_RESET_METHOD_MODE1 ||
tmp_reset_method == AMD_RESET_METHOD_MODE2)
amdgpu_ras_eeprom_check_and_recover(adev);
> +
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 571b70da4562..1009b60f6ae4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2560,6 +2560,7 @@ amdgpu_pci_remove(struct pci_dev *pdev)
> struct drm_device *dev = pci_get_drvdata(pdev);
> struct amdgpu_device *adev = drm_to_adev(dev);
>
> + amdgpu_ras_eeprom_check_and_recover(adev);
> amdgpu_xcp_dev_unplug(adev);
> amdgpu_gmc_prepare_nps_mode_change(adev);
> drm_dev_unplug(dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index f8a8c8502013..e03550be45b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -196,6 +196,7 @@ static int amdgpu_reserve_page_direct(struct
> amdgpu_device *adev, uint64_t addre
> amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
> err_data.err_addr_cnt, false);
> amdgpu_ras_save_bad_pages(adev, NULL);
> + amdgpu_ras_eeprom_check_and_recover(adev);
> }
>
> amdgpu_ras_error_data_fini(&err_data);
> @@ -3539,9 +3540,13 @@ int amdgpu_ras_init_badpage_info(struct
> amdgpu_device *adev)
> /* The format action is only applied to new ASICs */
> if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >=
> 12 &&
> control->tbl_hdr.version < RAS_TABLE_VER_V3)
> - if (!amdgpu_ras_eeprom_reset_table(control))
> + if (!amdgpu_ras_eeprom_reset_table(control)) {
> if (amdgpu_ras_save_bad_pages(adev, NULL))
> dev_warn(adev->dev, "Failed to format RAS
> EEPROM data in V3 version!\n");
> + else
> +
> amdgpu_ras_eeprom_check_and_recover(adev);
> + }
> +
> }
>
> return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 2af14c369bb9..2458c67526c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -1522,3 +1522,28 @@ int amdgpu_ras_eeprom_check(struct
> amdgpu_ras_eeprom_control *control)
>
> return res < 0 ? res : 0;
> }
> +
> +void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev) {
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> + struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
> + int res = 0;
> +
> + if (!control->is_eeprom_valid)
> + return;
> + res = __verify_ras_table_checksum(control);
> + if (res) {
> + dev_warn(adev->dev,
> + "RAS table incorrect checksum or error:%d, try to recover\n",
> + res);
> + if (!amdgpu_ras_eeprom_reset_table(control))
> + if (!amdgpu_ras_save_bad_pages(adev, NULL))
> + if (!__verify_ras_table_checksum(control)) {
> + dev_info(adev->dev, "RAS table recovery
> succeed\n");
> + return;
> + }
> + dev_err(adev->dev, "RAS table recovery failed\n");
> + control->is_eeprom_valid = false;
> + }
> + return;
> +}
> \ No newline at end of file
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 35c69ac3dbeb..ebfca4cb5688 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -161,6 +161,8 @@ void amdgpu_ras_debugfs_set_ret_size(struct
> amdgpu_ras_eeprom_control *control);
>
> int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control);
>
> +void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev);
> +
> extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
> extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index bfc86f1e84e5..081f7fd29394 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -80,6 +80,7 @@ int amdgpu_umc_page_retirement_mca(struct
> amdgpu_device *adev,
> amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
> err_data.err_addr_cnt, false);
> amdgpu_ras_save_bad_pages(adev, NULL);
> + amdgpu_ras_eeprom_check_and_recover(adev);
> }
>
> out_free_err_addr:
> --
> 2.34.1
More information about the amd-gfx mailing list