[PATCH 2/2] drm/amdgpu: exclude duplicate pages from UMC RAS UE count

Wed Feb 22 06:41:19 UTC 2023

[AMD Official Use Only - General]

Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>

Regards,
Stanley
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Sent: Wednesday, February 22, 2023 10:53 AM
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai,
> Thomas <YiPeng.Chai at amd.com>; Li, Candice <Candice.Li at amd.com>; Lazar,
> Lijo <Lijo.Lazar at amd.com>
> Subject: RE: [PATCH 2/2] drm/amdgpu: exclude duplicate pages from UMC
> RAS UE count
> 
> Ping...
> 
> > -----Original Message-----
> > From: Zhou1, Tao <Tao.Zhou1 at amd.com>
> > Sent: Monday, February 20, 2023 11:17 AM
> > To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> > <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>;
> Chai,
> > Thomas <YiPeng.Chai at amd.com>; Li, Candice <Candice.Li at amd.com>;
> Lazar,
> > Lijo <Lijo.Lazar at amd.com>
> > Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> > Subject: [PATCH 2/2] drm/amdgpu: exclude duplicate pages from UMC RAS
> > UE count
> >
> > If a UMC bad page is reserved but not freed by an application, the
> > application may trigger uncorrectable errors repeatedly by accessing the
> > page. Exclude such already-reserved pages from the UMC RAS UE count.
> >
> > v2: add specific function to do the check.
> > v3: remove duplicate pages, calculate the number of newly added bad pages.
> > v4: reuse save_bad_pages to calculate the number of newly added bad pages.
> >
> > Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 +++++++++++++---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 ++-
> > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  5 +++--
> >  3 files changed, 18 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 6e543558386d..5c02c6c9f773 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -176,7 +176,7 @@ static int amdgpu_reserve_page_direct(struct
> > amdgpu_device *adev, uint64_t addre
> >  	if (amdgpu_bad_page_threshold != 0) {
> >  		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
> >  					 err_data.err_addr_cnt);
> > -		amdgpu_ras_save_bad_pages(adev);
> > +		amdgpu_ras_save_bad_pages(adev, NULL);
> >  	}
> >
> >  	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES
> AND
> > WILL CORRUPT RAS EEPROM\n"); @@ -2084,22 +2084,32 @@ int
> > amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> >  /*
> >   * write error record array to eeprom, the function should be
> >   * protected by recovery_lock
> > + * new_cnt: new added UE count, excluding reserved bad pages, can be
> > + NULL
> >   */
> > -int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
> > +int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
> > +		unsigned long *new_cnt)
> >  {
> >  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> >  	struct ras_err_handler_data *data;
> >  	struct amdgpu_ras_eeprom_control *control;
> >  	int save_count;
> >
> > -	if (!con || !con->eh_data)
> > +	if (!con || !con->eh_data) {
> > +		if (new_cnt)
> > +			*new_cnt = 0;
> > +
> >  		return 0;
> > +	}
> >
> >  	mutex_lock(&con->recovery_lock);
> >  	control = &con->eeprom_control;
> >  	data = con->eh_data;
> >  	save_count = data->count - control->ras_num_recs;
> >  	mutex_unlock(&con->recovery_lock);
> > +
> > +	if (new_cnt)
> > +		*new_cnt = save_count / adev->umc.retire_unit;
> > +
> >  	/* only new entries are saved */
> >  	if (save_count > 0) {
> >  		if (amdgpu_ras_eeprom_append(control,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index f2ad999993f6..ef38f4c93df0 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -547,7 +547,8 @@ int amdgpu_ras_query_error_count(struct
> > amdgpu_device *adev,  int amdgpu_ras_add_bad_pages(struct
> > amdgpu_device *adev,
> >  		struct eeprom_table_record *bps, int pages);
> >
> > -int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
> > +int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
> > +		unsigned long *new_cnt);
> >
> >  static inline enum ta_ras_block
> >  amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) { diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > index 1c7fcb4f2380..7c6fc3214339 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > @@ -68,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct
> > amdgpu_device *adev,
> >  	if (amdgpu_bad_page_threshold != 0) {
> >  		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
> >  						err_data.err_addr_cnt);
> > -		amdgpu_ras_save_bad_pages(adev);
> > +		amdgpu_ras_save_bad_pages(adev, NULL);
> >  	}
> >
> >  out:
> > @@ -147,7 +147,8 @@ static int amdgpu_umc_do_page_retirement(struct
> > amdgpu_device *adev,
> >  			err_data->err_addr_cnt) {
> >  			amdgpu_ras_add_bad_pages(adev, err_data-
> >err_addr,
> >  						err_data->err_addr_cnt);
> > -			amdgpu_ras_save_bad_pages(adev);
> > +
> > +			amdgpu_ras_save_bad_pages(adev, &(err_data-
> > >ue_count));
> >
> >  			amdgpu_dpm_send_hbm_bad_pages_num(adev,
> con-
> > >eeprom_control.ras_num_recs);
> >
> > --
> > 2.35.1