[PATCH 1/2] drm/amdgpu: refine eeprom data check
Lazar, Lijo
lijo.lazar at amd.com
Fri Jul 11 08:07:50 UTC 2025
On 7/11/2025 1:29 PM, ganglxie wrote:
> Add an EEPROM data checksum check before driver unload. Reset the EEPROM
> and save the correct data to it when the check fails.
>
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 28 +++++++++++++++++++
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 ++
> 3 files changed, 31 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 571b70da4562..1009b60f6ae4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2560,6 +2560,7 @@ amdgpu_pci_remove(struct pci_dev *pdev)
> struct drm_device *dev = pci_get_drvdata(pdev);
> struct amdgpu_device *adev = drm_to_adev(dev);
>
> + amdgpu_ras_eeprom_check_and_recover(adev);
> amdgpu_xcp_dev_unplug(adev);
> amdgpu_gmc_prepare_nps_mode_change(adev);
> drm_dev_unplug(dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 2af14c369bb9..df0e9b87d578 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -1522,3 +1522,31 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
>
> return res < 0 ? res : 0;
> }
> +
> +void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev)
> +{
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> + struct amdgpu_ras_eeprom_control *control;
> + int res = 0;
This initialization could be skipped, since res is assigned from __verify_ras_table_checksum() before it is first read. With that -
Reviewed-by: Lijo Lazar <lijo.lazar at amd.com>
Thanks,
Lijo
> +
> + if (!ras)
> + return;
> + control = &ras->eeprom_control;
> + if (!control->is_eeprom_valid)
> + return;
> + res = __verify_ras_table_checksum(control);
> + if (res) {
> + dev_warn(adev->dev,
> + "RAS table incorrect checksum or error:%d, try to recover\n",
> + res);
> + if (!amdgpu_ras_eeprom_reset_table(control))
> + if (!amdgpu_ras_save_bad_pages(adev, NULL))
> + if (!__verify_ras_table_checksum(control)) {
> + dev_info(adev->dev, "RAS table recovery succeed\n");
> + return;
> + }
> + dev_err(adev->dev, "RAS table recovery failed\n");
> + control->is_eeprom_valid = false;
> + }
> + return;
> +}
> \ No newline at end of file
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 35c69ac3dbeb..ebfca4cb5688 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -161,6 +161,8 @@ void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
>
> int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control);
>
> +void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev);
> +
> extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
> extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
>
More information about the amd-gfx mailing list