[PATCH 3/5] drm/amdgpu: conduct bad gpu check during bootup/reset

Thu Jul 23 03:38:43 UTC 2020

[AMD Public Use]

Thanks for your review, Tao.
Please check my comments after yours.

Regards,
Guchun

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com> 
Sent: Thursday, July 23, 2020 10:51 AM
To: Chen, Guchun <Guchun.Chen at amd.com>; amd-gfx at lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Clements, John <John.Clements at amd.com>
Subject: RE: [PATCH 3/5] drm/amdgpu: conduct bad gpu check during bootup/reset

[AMD Official Use Only - Internal Distribution Only]

> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen at amd.com>
> Sent: Wednesday, July 22, 2020 11:14 AM
> To: amd-gfx at lists.freedesktop.org; Deucher, Alexander 
> <Alexander.Deucher at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; 
> Li, Dennis <Dennis.Li at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; 
> Zhou1, Tao <Tao.Zhou1 at amd.com>; Clements, John <John.Clements at amd.com>
> Cc: Chen, Guchun <Guchun.Chen at amd.com>
> Subject: [PATCH 3/5] drm/amdgpu: conduct bad gpu check during 
> bootup/reset
>
> During boot up, when init eeprom, RAS will check if the BAD GPU tag is 
> saved or not. And will break boot up and notify user to RMA it.
>
> At the meanwhile, when saved bad page count to eeprom exceeds 
> threshold, one ras recovery will be triggered immediately, and bad gpu 
> check will be executed and reset will fail as well to remind user.
>
> User could set bad_page_threshold = 0 as module parameter when probing 
> driver to disable the bad feature check.
>
> Signed-off-by: Guchun Chen <guchun.chen at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 21 ++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 25 +++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       | 16 +++-
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 87 ++++++++++++++++++-
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  6 +-
>  5 files changed, 142 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2662cd7c8685..d529bf7917f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2065,7 +2065,9 @@ static int amdgpu_device_ip_init(struct 
> amdgpu_device *adev)
>   * Note: theoretically, this should be called before all vram allocations
>   * to protect retired page from abusing
>   */

[Tao] The comment above should be also updated
[Guchun]yeah, will update it later.

> -amdgpu_ras_recovery_init(adev);
> +r = amdgpu_ras_recovery_init(adev);
> +if (r)
> +goto init_failed;

[Tao] Are you sure "r != 0" equals to "bad gpu"? Other errors of recovery_init should not block gpu init.
[Guchun]Good point. This should be addressed carefully.
>
>  if (adev->gmc.xgmi.num_physical_nodes > 1)  
> amdgpu_xgmi_add_device(adev); @@ -4133,8 +4135,20 @@ static int 
> amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>
>  amdgpu_fbdev_set_suspend(tmp_adev, 0);
>
> -/* must succeed. */
> -amdgpu_ras_resume(tmp_adev);
> +/*
> + * The CPU is BAD once faulty pages by ECC has
> + * reached the threshold, and ras recovery is
> + * scheduled. So add one check here to prevent
> + * recovery if it's one BAD GPU, and remind
> + * user to RMA this GPU.
> + */
> +if (!amdgpu_ras_check_bad_gpu(tmp_adev)) {
> +/* must succeed. */
> +amdgpu_ras_resume(tmp_adev);
> +} else {
> +r = -EINVAL;
> +goto out;
> +}
>
>  /* Update PSP FW topology after reset */  if (hive && tmp_adev-
> >gmc.xgmi.num_physical_nodes > 1) @@ -4142,7 +4156,6 @@ static int
> amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,  }  }
>
> -
>  out:
>  if (!r) {
>  amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index e3d67d85c55f..818d4154e4a3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -62,8 +62,6 @@ const char *ras_block_string[] = {  #define ras_err_str(i)
> (ras_error_string[ffs(i)])  #define ras_block_str(i) (ras_block_string[i])
>
> -#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS1
> -#define AMDGPU_RAS_FLAG_INIT_NEED_RESET2
>  #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
>
>  /* inject address is 52 bits */
> @@ -1817,6 +1815,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)
>  struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>  struct ras_err_handler_data **data;
>  uint32_t max_eeprom_records_len = 0;
> +bool bad_gpu = false;
>  int ret;
>
>  if (con)
> @@ -1838,9 +1837,14 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)
>  max_eeprom_records_len =
> amdgpu_ras_eeprom_get_record_max_length();
>  amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
>
> -ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
> -if (ret)
> +ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &bad_gpu);
> +/*
> + * we only fail this calling and halt booting when bad_gpu is true.
> + */
> +if (bad_gpu) {
> +ret = -EINVAL;
>  goto free;
> +}
>
>  if (con->eeprom_control.num_recs) {
>  ret = amdgpu_ras_load_bad_pages(adev); @@ -2189,3
> +2193,16 @@ bool amdgpu_ras_need_emergency_restart(struct
> amdgpu_device *adev)
>
>  return false;
>  }
> +
> +bool amdgpu_ras_check_bad_gpu(struct amdgpu_device *adev) {
> +struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +bool bad_gpu = false;
> +
> +if (con && (con->bad_page_cnt_threshold != 0xFFFFFFFF))
> +amdgpu_ras_eeprom_check_bad_gpu(&con->eeprom_control,
> +&bad_gpu);
> +
> +/* We are only interested in variable bad_gpu. */
> +return bad_gpu;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 4672649a9293..d7a363b37feb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -31,6 +31,10 @@
>  #include "ta_ras_if.h"
>  #include "amdgpu_ras_eeprom.h"
>
> +#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS(0x1 << 0)
> +#define AMDGPU_RAS_FLAG_INIT_NEED_RESET(0x1 << 1)
> +#define AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV(0x1 << 2)
> +
>  enum amdgpu_ras_block {
>  AMDGPU_RAS_BLOCK__UMC = 0,
>  AMDGPU_RAS_BLOCK__SDMA,
> @@ -493,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device
> *adev);  unsigned long amdgpu_ras_query_error_count(struct amdgpu_device
> *adev,
>  bool is_ce);
>
> +bool amdgpu_ras_check_bad_gpu(struct amdgpu_device *adev);
> +
>  /* error handling functions */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>  struct eeprom_table_record *bps, int pages); @@ -503,10
> +509,14 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device
> *adev)  {
>  struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>
> -/* save bad page to eeprom before gpu reset,
> - * i2c may be unstable in gpu reset
> +/*
> + * Save bad page to eeprom before gpu reset, i2c may be unstable
> + * in gpu reset.
> + *
> + * Also, exclude the case when ras recovery issuer is
> + * eerprom page write.
>   */
> -if (in_task())
> +if (!(ras->flags & AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV) &&
> in_task())
>  amdgpu_ras_reserve_bad_pages(adev);
>
>  if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index a2c982b1eac6..96b63c026bad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

[Tao] It's better to split it into two patches, one is for ras and another one is for eeprom
[Guchun]I tried this, however, even if for ras change, eeprom change is needed as well. So I merged them both into one patch.

> @@ -46,6 +46,9 @@
>  #define EEPROM_TABLE_HDR_VAL 0x414d4452  #define EEPROM_TABLE_VER
> 0x00010000
>
> +/* Bad GPU tag ‘BADG’ */
> +#define EEPROM_TABLE_HDR_BAD 0x42414447
> +
>  /* Assume 2 Mbit size */
>  #define EEPROM_SIZE_BYTES 256000
>  #define EEPROM_PAGE__SIZE_BYTES 256
> @@ -238,12 +241,14 @@ int amdgpu_ras_eeprom_reset_table(struct
> amdgpu_ras_eeprom_control *control)
>
>  }
>
> -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
> +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> +bool *bad_gpu)
>  {
>  int ret = 0;
>  struct amdgpu_device *adev = to_amdgpu_device(control);
>  unsigned char buff[EEPROM_ADDRESS_SIZE +
> EEPROM_TABLE_HEADER_SIZE] = { 0 };
>  struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
> +struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>  struct i2c_msg msg = {
>  .addr= 0,
>  .flags= I2C_M_RD,
> @@ -251,6 +256,8 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control)
>  .buf= buff,
>  };
>
> +*bad_gpu = false;
> +
>  /* Verify i2c adapter is initialized */
>  if (!adev->pm.smu_i2c.algo)
>  return -ENOENT;
> @@ -279,6 +286,11 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control)
>  DRM_DEBUG_DRIVER("Found existing EEPROM table with %d
> records",
>   control->num_recs);
>
> +} else if ((ras->bad_page_cnt_threshold != 0xFFFFFFFF) && (
> +hdr->header == EEPROM_TABLE_HDR_BAD)) {
> +*bad_gpu = true;
> +DRM_ERROR("Detect BAD GPU TAG in eeprom table and "
> +"GPU shall be RMAed.\n");
>  } else {
>  DRM_INFO("Creating new EEPROM table");
>
> @@ -375,6 +387,44 @@ static uint32_t
> __correct_eeprom_dest_address(uint32_t curr_address)
>  return curr_address;
>  }
>
> +int amdgpu_ras_eeprom_check_bad_gpu(struct amdgpu_ras_eeprom_control
> *control,
> +bool *bad_gpu)
> +{
> +struct amdgpu_device *adev = to_amdgpu_device(control);
> +unsigned char buff[EEPROM_ADDRESS_SIZE +
> +EEPROM_TABLE_HEADER_SIZE] = { 0 };
> +struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
> +struct i2c_msg msg = {
> +.addr = control->i2c_address,
> +.flags = I2C_M_RD,
> +.len = EEPROM_ADDRESS_SIZE +
> EEPROM_TABLE_HEADER_SIZE,
> +.buf = buff,
> +};
> +int ret;
> +
> +*bad_gpu = false;
> +
> +/* read EEPROM table header */
> +mutex_lock(&control->tbl_mutex);
> +ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
> +if (ret < 1) {
> +dev_err(adev->dev, "Failed to read EEPROM table header.\n");
> +goto err;

[Tao] One tab is missed
[Guchun]Correct this later.

> +}
> +
> +__decode_table_header_from_buff(hdr, &buff[2]);
> +
> +if (hdr->header == EEPROM_TABLE_HDR_BAD) {
> +dev_warn(adev->dev, "Current GPU is BAD and should be
> RMAed.\n");
> +*bad_gpu = true;
> +}
> +
> +ret = 0;
> +err:
> +mutex_unlock(&control->tbl_mutex);
> +return ret;
> +}
> +
>  int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control
> *control,
>      struct eeprom_table_record
> *records,
>      bool write,
> @@ -383,8 +433,10 @@ int amdgpu_ras_eeprom_process_recods(struct
> amdgpu_ras_eeprom_control *control,
>  int i, ret = 0;
>  struct i2c_msg *msgs, *msg;
>  unsigned char *buffs, *buff;
> +bool sched_ras_recovery = false;
>  struct eeprom_table_record *record;
>  struct amdgpu_device *adev = to_amdgpu_device(control);
> +struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>
>  if (adev->asic_type != CHIP_VEGA20 && adev->asic_type !=
> CHIP_ARCTURUS)
>  return 0;
> @@ -402,6 +454,25 @@ int amdgpu_ras_eeprom_process_recods(struct
> amdgpu_ras_eeprom_control *control,
>  goto free_buff;
>  }
>
> +/*
> + * If saved bad pages number exceeds the bad page threshod for
> + * the whole VRAM, update table header to mark one BAD GPU and
> + * schedule one ras recovery after eeprom write is done, this
> + * can avoid the missing for latest records.
> + *
> + * This new header will be picked up and checked in the bootup by
> + * ras recovery, which may break bootup process to notify user this
> + * GPU is bad and to RMA(Return Merchandise Authorization) this GPU.
> + */
> +if (write && (ras->bad_page_cnt_threshold != 0xFFFFFFFF) &&
> +((control->num_recs + num) >= ras->bad_page_cnt_threshold)) {
> +dev_warn(adev->dev,
> +"Saved bad pages(%d) reaches threshold value(%d).\n",
> +control->num_recs + num, ras-
> >bad_page_cnt_threshold);
> +control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
> +sched_ras_recovery = true;
> +}
> +
>  /* In case of overflow just start from beginning to not lose newest
> records */
>  if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE *
> num > EEPROM_SIZE_BYTES))
>  control->next_addr = EEPROM_RECORD_START; @@ -482,6
> +553,20 @@ int amdgpu_ras_eeprom_process_recods(struct
> amdgpu_ras_eeprom_control *control,
>  __update_tbl_checksum(control, records, num,
> old_hdr_byte_sum);
>
>  __update_table_header(control, buffs);
> +
> +if (sched_ras_recovery) {
> +/*
> + * Before scheduling ras recovery, assert the related
> + * flag first, which shall bypass common bad page
> + * reservation execution in amdgpu_ras_reset_gpu.
> + */
> +amdgpu_ras_get_context(adev)->flags |=
> +AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV;
> +
> +dev_warn(adev->dev, "Conduct ras recovery due to bad
> "
> +"page threshold reached.\n");
> +amdgpu_ras_reset_gpu(adev);
> +}
>  } else if (!__validate_tbl_checksum(control, records, num)) {
>  DRM_WARN("EEPROM Table checksum mismatch!");
>  /* TODO Uncomment when EEPROM read/write is relliable */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index b272840cb069..4ccd139248a9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -77,9 +77,13 @@ struct eeprom_table_record {
>  unsigned char mcumc_id;
>  }__attribute__((__packed__));
>
> -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
> +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> +bool *bad_gpu);
>  int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control
> *control);
>
> +int amdgpu_ras_eeprom_check_bad_gpu(struct amdgpu_ras_eeprom_control
> *control,
> +bool *bad_gpu);
> +
>  int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control
> *control,
>      struct eeprom_table_record
> *records,
>      bool write,
> --
> 2.17.1