[PATCH 2/5] drm/amdgpu: validate bad page threshold in ras

Wed Jul 22 07:51:55 UTC 2020

[AMD Official Use Only - Internal Distribution Only]

Hi Guchun,

Please see my comment inline.

> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen at amd.com>
> Sent: Wednesday, July 22, 2020 11:14 AM
> To: amd-gfx at lists.freedesktop.org; Deucher, Alexander
> <Alexander.Deucher at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Yang,
> Stanley <Stanley.Yang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>;
> Clements, John <John.Clements at amd.com>
> Cc: Chen, Guchun <Guchun.Chen at amd.com>
> Subject: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras
> 
> Bad page threshold value should be valid in the range between
> -1 and max records length of eeprom. It could determine when the GPU
> should be retired.
> 
> Signed-off-by: Guchun Chen <guchun.chen at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 43
> +++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  3 ++
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    |  5 +++
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  2 +
>  4 files changed, 53 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6f06e1214622..e3d67d85c55f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -69,6 +69,9 @@ const char *ras_block_string[] = {
>  /* inject address is 52 bits */
>  #define	RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)
> 
> +/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
> +#define RAS_BAD_PAGE_RATE		(100 * 1024 * 1024ULL)
> +
>  enum amdgpu_ras_retire_page_reservation {
>  	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>  	AMDGPU_RAS_RETIRE_PAGE_PENDING,
> @@ -1700,6 +1703,42 @@ static bool amdgpu_ras_check_bad_page(struct
> amdgpu_device *adev,
>  	return ret;
>  }
> 
> +static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
> +					uint32_t max_length)
> +{
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	int tmp_threshold = amdgpu_bad_page_threshold;
> +	u64 val;
> +
> +	/*
> +	 * Justification of value bad_page_cnt_threshold in ras structure
> +	 *
> +	 * 1. -1 <= amdgpu_bad_page_threshold <= max record length in
> eeprom
> +	 * 2. if amdgpu_bad_page_threshold = -1,
> +	 *    bad_page_cnt_threshold = typical value by formula.
> +	 * 3. if amdgpu_bad_page_threshold = 0,
> +	 *    bad_page_cnt_threshold = 0xFFFFFFFF,
> +	 *    and disable RMA feature accordingly.
> +	 * 4. use the value specified from user when
> (amdgpu_bad_page_threshold
> +	 *    > 0 && < max record length in eeprom).
> +	 */
> +
> +	if (tmp_threshold < -1)
> +		tmp_threshold = -1;
> +	else if (tmp_threshold > max_length)
> +		tmp_threshold = max_length;
> +
> +	if (tmp_threshold == -1) {
> +		val = adev->gmc.mc_vram_size;
> +		do_div(val, RAS_BAD_PAGE_RATE);
> +		con->bad_page_cnt_threshold = lower_32_bits(val);
[Yang, Stanley] : It's better to compare con->bad_page_cnt_threshold with max_length,
the value of bad_page_cnt_threshold should not exceed max_length.

> +	} else if (tmp_threshold == 0) {
> +		con->bad_page_cnt_threshold = 0xFFFFFFFF;
> +	} else {
> +		con->bad_page_cnt_threshold = tmp_threshold;
> +	}
> +}
> +
>  /* called in gpu recovery/init */
>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)  { @@ -
> 1777,6 +1816,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)  {
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>  	struct ras_err_handler_data **data;
> +	uint32_t max_eeprom_records_len = 0;
>  	int ret;
> 
>  	if (con)
> @@ -1795,6 +1835,9 @@ int amdgpu_ras_recovery_init(struct
> amdgpu_device *adev)
>  	atomic_set(&con->in_recovery, 0);
>  	con->adev = adev;
> 
> +	max_eeprom_records_len =
> amdgpu_ras_eeprom_get_record_max_length();
> +	amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
> +
>  	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
>  	if (ret)
>  		goto free;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index b2667342cf67..4672649a9293 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -336,6 +336,9 @@ struct amdgpu_ras {
>  	struct amdgpu_ras_eeprom_control eeprom_control;
> 
>  	bool error_query_ready;
> +
> +	/* bad page count threshold */
> +	uint32_t bad_page_cnt_threshold;
>  };
> 
>  struct ras_fs_data {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index c0096097bbcf..a2c982b1eac6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -499,6 +499,11 @@ int amdgpu_ras_eeprom_process_recods(struct
> amdgpu_ras_eeprom_control *control,
>  	return ret == num ? 0 : -EIO;
>  }
> 
> +inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
> +{
> +	return EEPROM_MAX_RECORD_NUM;
> +}
> +
>  /* Used for testing if bugs encountered */  #if 0  void
> amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 7e8647a05df7..b272840cb069 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -85,6 +85,8 @@ int amdgpu_ras_eeprom_process_recods(struct
> amdgpu_ras_eeprom_control *control,
>  					    bool write,
>  					    int num);
> 
> +inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void);
> +
>  void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control
> *control);
> 
>  #endif // _AMDGPU_RAS_EEPROM_H
> --
> 2.17.1

Regards,
Stanley