[PATCH 1/2] drm/amdgpu: refine eeprom data check

Wed Jul 9 08:40:06 UTC 2025

On 7/9/2025 1:59 PM, ganglxie wrote:
> add eeprom data checksum check after data writing, after gpu
> reset, and before driver unload. reset eeprom and save correct
> data to eeprom when check failed
> 
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 13 ++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  7 +++++-
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 25 +++++++++++++++++++
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       |  1 +
>  6 files changed, 48 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 84fcaf84fead..2503f2300472 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6410,6 +6410,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  	struct amdgpu_hive_info *hive = NULL;
>  	int r = 0;
>  	bool need_emergency_restart = false;
> +	enum amd_reset_method tmp_reset_method;
>  
>  	if (amdgpu_reset_domain_in_drain_mode(adev->reset_domain))
>  		return 0;
> @@ -6506,6 +6507,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  	if (!r)
>  		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
>  
> +	if (reset_context->method == AMD_RESET_METHOD_MODE1 ||
> +	    reset_context->method == AMD_RESET_METHOD_MODE2) {
> +		amdgpu_ras_eeprom_check_and_recover(adev);
> +	} else {
> +		if (reset_context->method == AMD_RESET_METHOD_NONE) {
> +			tmp_reset_method = adev->asic_funcs->reset_method(adev);
> +			if (tmp_reset_method == AMD_RESET_METHOD_MODE1 ||
> +			    tmp_reset_method == AMD_RESET_METHOD_MODE2)
> +				amdgpu_ras_eeprom_check_and_recover(adev);
> +		}
> +	}
> +

It's not required to do this for every reset. You may check whether the
reset context source is RAS and do the recheck only in that case.

>  	return r;
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 571b70da4562..1009b60f6ae4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2560,6 +2560,7 @@ amdgpu_pci_remove(struct pci_dev *pdev)
>  	struct drm_device *dev = pci_get_drvdata(pdev);
>  	struct amdgpu_device *adev = drm_to_adev(dev);
>  
> +	amdgpu_ras_eeprom_check_and_recover(adev);
>  	amdgpu_xcp_dev_unplug(adev);
>  	amdgpu_gmc_prepare_nps_mode_change(adev);
>  	drm_dev_unplug(dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index f8a8c8502013..e03550be45b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -196,6 +196,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
>  		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
>  					 err_data.err_addr_cnt, false);
>  		amdgpu_ras_save_bad_pages(adev, NULL);
> +		amdgpu_ras_eeprom_check_and_recover(adev);

As also mentioned earlier, there should be some way to identify when
this is really required. We have had plenty of issues in the past with
delayed resets during RAS errors. Introducing more handshakes/I2C
transactions with PMFW is strictly discouraged.

Thanks,
Lijo

>  	}
>  
>  	amdgpu_ras_error_data_fini(&err_data);
> @@ -3539,9 +3540,13 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
>  		/* The format action is only applied to new ASICs */
>  		if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
>  		    control->tbl_hdr.version < RAS_TABLE_VER_V3)
> -			if (!amdgpu_ras_eeprom_reset_table(control))
> +			if (!amdgpu_ras_eeprom_reset_table(control)) {
>  				if (amdgpu_ras_save_bad_pages(adev, NULL))
>  					dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
> +				else
> +					amdgpu_ras_eeprom_check_and_recover(adev);
> +			}
> +
>  	}
>  
>  	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 2af14c369bb9..2458c67526c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -1522,3 +1522,28 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
>  
>  	return res < 0 ? res : 0;
>  }
> +
> +void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +	struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
> +	int res = 0;
> +
> +	if (!control->is_eeprom_valid)
> +		return;
> +	res = __verify_ras_table_checksum(control);
> +	if (res) {
> +		dev_warn(adev->dev,
> +			"RAS table incorrect checksum or error:%d, try to recover\n",
> +			res);
> +		if (!amdgpu_ras_eeprom_reset_table(control))
> +			if (!amdgpu_ras_save_bad_pages(adev, NULL))
> +				if (!__verify_ras_table_checksum(control)) {
> +					dev_info(adev->dev, "RAS table recovery succeed\n");
> +					return;
> +				}
> +		dev_err(adev->dev, "RAS table recovery failed\n");
> +		control->is_eeprom_valid = false;
> +	}
> +	return;
> +}
> \ No newline at end of file
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 35c69ac3dbeb..ebfca4cb5688 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -161,6 +161,8 @@ void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
>  
>  int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control);
>  
> +void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev);
> +
>  extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
>  extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index bfc86f1e84e5..081f7fd29394 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -80,6 +80,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
>  		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
>  						err_data.err_addr_cnt, false);
>  		amdgpu_ras_save_bad_pages(adev, NULL);
> +		amdgpu_ras_eeprom_check_and_recover(adev);
>  	}
>  
>  out_free_err_addr: