[PATCH 2/2] drm/amdgpu: exclude duplicate pages from UMC RAS UE count

Zhou1, Tao Tao.Zhou1 at amd.com
Wed Feb 22 02:53:29 UTC 2023


Ping...

> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Sent: Monday, February 20, 2023 11:17 AM
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai,
> Thomas <YiPeng.Chai at amd.com>; Li, Candice <Candice.Li at amd.com>; Lazar,
> Lijo <Lijo.Lazar at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 2/2] drm/amdgpu: exclude duplicate pages from UMC RAS UE
> count
> 
> If a UMC bad page is reserved but not freed by an application, the application
> may trigger uncorrectable errors repeatedly by accessing the page.
> 
> v2: add specific function to do the check.
> v3: remove duplicate pages, calculate the number of newly added bad pages.
> v4: reuse save_bad_pages to calculate the number of newly added bad pages.
> 
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 +++++++++++++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  5 +++--
>  3 files changed, 18 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6e543558386d..5c02c6c9f773 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -176,7 +176,7 @@ static int amdgpu_reserve_page_direct(struct
> amdgpu_device *adev, uint64_t addre
>  	if (amdgpu_bad_page_threshold != 0) {
>  		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
>  					 err_data.err_addr_cnt);
> -		amdgpu_ras_save_bad_pages(adev);
> +		amdgpu_ras_save_bad_pages(adev, NULL);
>  	}
> 
>  	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES
> AND WILL CORRUPT RAS EEPROM\n"); @@ -2084,22 +2084,32 @@ int
> amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>  /*
>   * write error record array to eeprom, the function should be
>   * protected by recovery_lock
> + * new_cnt: new added UE count, excluding reserved bad pages, can be
> + NULL
>   */
> -int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
> +int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
> +		unsigned long *new_cnt)
>  {
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>  	struct ras_err_handler_data *data;
>  	struct amdgpu_ras_eeprom_control *control;
>  	int save_count;
> 
> -	if (!con || !con->eh_data)
> +	if (!con || !con->eh_data) {
> +		if (new_cnt)
> +			*new_cnt = 0;
> +
>  		return 0;
> +	}
> 
>  	mutex_lock(&con->recovery_lock);
>  	control = &con->eeprom_control;
>  	data = con->eh_data;
>  	save_count = data->count - control->ras_num_recs;
>  	mutex_unlock(&con->recovery_lock);
> +
> +	if (new_cnt)
> +		*new_cnt = save_count / adev->umc.retire_unit;
> +
>  	/* only new entries are saved */
>  	if (save_count > 0) {
>  		if (amdgpu_ras_eeprom_append(control,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index f2ad999993f6..ef38f4c93df0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -547,7 +547,8 @@ int amdgpu_ras_query_error_count(struct
> amdgpu_device *adev,  int amdgpu_ras_add_bad_pages(struct amdgpu_device
> *adev,
>  		struct eeprom_table_record *bps, int pages);
> 
> -int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
> +int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
> +		unsigned long *new_cnt);
> 
>  static inline enum ta_ras_block
>  amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) { diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index 1c7fcb4f2380..7c6fc3214339 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -68,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct
> amdgpu_device *adev,
>  	if (amdgpu_bad_page_threshold != 0) {
>  		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
>  						err_data.err_addr_cnt);
> -		amdgpu_ras_save_bad_pages(adev);
> +		amdgpu_ras_save_bad_pages(adev, NULL);
>  	}
> 
>  out:
> @@ -147,7 +147,8 @@ static int amdgpu_umc_do_page_retirement(struct
> amdgpu_device *adev,
>  			err_data->err_addr_cnt) {
>  			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
>  						err_data->err_addr_cnt);
> -			amdgpu_ras_save_bad_pages(adev);
> +
> +			amdgpu_ras_save_bad_pages(adev, &(err_data-
> >ue_count));
> 
>  			amdgpu_dpm_send_hbm_bad_pages_num(adev, con-
> >eeprom_control.ras_num_recs);
> 
> --
> 2.35.1



More information about the amd-gfx mailing list