[PATCH 3/5] drm/amdgpu: conduct bad gpu check during bootup/reset
Chen, Guchun
Guchun.Chen at amd.com
Thu Jul 23 03:38:43 UTC 2020
[AMD Public Use]
Thanks for your review, Tao.
Please check my comments after yours.
Regards,
Guchun
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Thursday, July 23, 2020 10:51 AM
To: Chen, Guchun <Guchun.Chen at amd.com>; amd-gfx at lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Clements, John <John.Clements at amd.com>
Subject: RE: [PATCH 3/5] drm/amdgpu: conduct bad gpu check during bootup/reset
[AMD Official Use Only - Internal Distribution Only]
> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen at amd.com>
> Sent: Wednesday, July 22, 2020 11:14 AM
> To: amd-gfx at lists.freedesktop.org; Deucher, Alexander
> <Alexander.Deucher at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>;
> Li, Dennis <Dennis.Li at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>;
> Zhou1, Tao <Tao.Zhou1 at amd.com>; Clements, John <John.Clements at amd.com>
> Cc: Chen, Guchun <Guchun.Chen at amd.com>
> Subject: [PATCH 3/5] drm/amdgpu: conduct bad gpu check during
> bootup/reset
>
> During boot up, when init eeprom, RAS will check if the BAD GPU tag is
> saved or not. And will break boot up and notify user to RMA it.
>
> At the meanwhile, when saved bad page count to eeprom exceeds
> threshold, one ras recovery will be triggered immediately, and bad gpu
> check will be executed and reset will fail as well to remind user.
>
> User could set bad_page_threshold = 0 as module parameter when probing
> driver to disable the bad feature check.
>
> Signed-off-by: Guchun Chen <guchun.chen at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 ++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 25 +++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 16 +++-
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 87 ++++++++++++++++++-
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 6 +-
> 5 files changed, 142 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2662cd7c8685..d529bf7917f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2065,7 +2065,9 @@ static int amdgpu_device_ip_init(struct
> amdgpu_device *adev)
> * Note: theoretically, this should be called before all vram allocations
> * to protect retired page from abusing
> */
[Tao] The comment above should be also updated
[Guchun]yeah, will update it later.
> -amdgpu_ras_recovery_init(adev);
> +r = amdgpu_ras_recovery_init(adev);
> +if (r)
> +goto init_failed;
[Tao] Are you sure "r != 0" equals to "bad gpu"? Other errors of recovery_init should not block gpu init.
[Guchun]Good point. This should be addressed carefully.
>
> if (adev->gmc.xgmi.num_physical_nodes > 1)
> amdgpu_xgmi_add_device(adev); @@ -4133,8 +4135,20 @@ static int
> amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>
> amdgpu_fbdev_set_suspend(tmp_adev, 0);
>
> -/* must succeed. */
> -amdgpu_ras_resume(tmp_adev);
> +/*
> + * The CPU is BAD once faulty pages by ECC has
> + * reached the threshold, and ras recovery is
> + * scheduled. So add one check here to prevent
> + * recovery if it's one BAD GPU, and remind
> + * user to RMA this GPU.
> + */
> +if (!amdgpu_ras_check_bad_gpu(tmp_adev)) {
> +/* must succeed. */
> +amdgpu_ras_resume(tmp_adev);
> +} else {
> +r = -EINVAL;
> +goto out;
> +}
>
> /* Update PSP FW topology after reset */ if (hive && tmp_adev-
> >gmc.xgmi.num_physical_nodes > 1) @@ -4142,7 +4156,6 @@ static int
> amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, } }
>
> -
> out:
> if (!r) {
> amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index e3d67d85c55f..818d4154e4a3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -62,8 +62,6 @@ const char *ras_block_string[] = { #define ras_err_str(i)
> (ras_error_string[ffs(i)]) #define ras_block_str(i) (ras_block_string[i])
>
> -#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS1
> -#define AMDGPU_RAS_FLAG_INIT_NEED_RESET2
> #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
>
> /* inject address is 52 bits */
> @@ -1817,6 +1815,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> struct ras_err_handler_data **data;
> uint32_t max_eeprom_records_len = 0;
> +bool bad_gpu = false;
> int ret;
>
> if (con)
> @@ -1838,9 +1837,14 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)
> max_eeprom_records_len =
> amdgpu_ras_eeprom_get_record_max_length();
> amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
>
> -ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
> -if (ret)
> +ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &bad_gpu);
> +/*
> + * we only fail this calling and halt booting when bad_gpu is true.
> + */
> +if (bad_gpu) {
> +ret = -EINVAL;
> goto free;
> +}
>
> if (con->eeprom_control.num_recs) {
> ret = amdgpu_ras_load_bad_pages(adev); @@ -2189,3
> +2193,16 @@ bool amdgpu_ras_need_emergency_restart(struct
> amdgpu_device *adev)
>
> return false;
> }
> +
> +bool amdgpu_ras_check_bad_gpu(struct amdgpu_device *adev) {
> +struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +bool bad_gpu = false;
> +
> +if (con && (con->bad_page_cnt_threshold != 0xFFFFFFFF))
> +amdgpu_ras_eeprom_check_bad_gpu(&con->eeprom_control,
> +&bad_gpu);
> +
> +/* We are only interested in variable bad_gpu. */
> +return bad_gpu;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 4672649a9293..d7a363b37feb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -31,6 +31,10 @@
> #include "ta_ras_if.h"
> #include "amdgpu_ras_eeprom.h"
>
> +#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS(0x1 << 0)
> +#define AMDGPU_RAS_FLAG_INIT_NEED_RESET(0x1 << 1)
> +#define AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV(0x1 << 2)
> +
> enum amdgpu_ras_block {
> AMDGPU_RAS_BLOCK__UMC = 0,
> AMDGPU_RAS_BLOCK__SDMA,
> @@ -493,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device
> *adev); unsigned long amdgpu_ras_query_error_count(struct amdgpu_device
> *adev,
> bool is_ce);
>
> +bool amdgpu_ras_check_bad_gpu(struct amdgpu_device *adev);
> +
> /* error handling functions */
> int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> struct eeprom_table_record *bps, int pages); @@ -503,10
> +509,14 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device
> *adev) {
> struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>
> -/* save bad page to eeprom before gpu reset,
> - * i2c may be unstable in gpu reset
> +/*
> + * Save bad page to eeprom before gpu reset, i2c may be unstable
> + * in gpu reset.
> + *
> + * Also, exclude the case when ras recovery issuer is
> + * eerprom page write.
> */
> -if (in_task())
> +if (!(ras->flags & AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV) &&
> in_task())
> amdgpu_ras_reserve_bad_pages(adev);
>
> if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index a2c982b1eac6..96b63c026bad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
[Tao] It's better to split it into two patches, one is for ras and another one is for eeprom
[Guchun]I tried this, however, even if for ras change, eeprom change is needed as well. So I merged them both into one patch.
> @@ -46,6 +46,9 @@
> #define EEPROM_TABLE_HDR_VAL 0x414d4452 #define EEPROM_TABLE_VER
> 0x00010000
>
> +/* Bad GPU tag ‘BADG’ */
> +#define EEPROM_TABLE_HDR_BAD 0x42414447
> +
> /* Assume 2 Mbit size */
> #define EEPROM_SIZE_BYTES 256000
> #define EEPROM_PAGE__SIZE_BYTES 256
> @@ -238,12 +241,14 @@ int amdgpu_ras_eeprom_reset_table(struct
> amdgpu_ras_eeprom_control *control)
>
> }
>
> -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
> +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> +bool *bad_gpu)
> {
> int ret = 0;
> struct amdgpu_device *adev = to_amdgpu_device(control);
> unsigned char buff[EEPROM_ADDRESS_SIZE +
> EEPROM_TABLE_HEADER_SIZE] = { 0 };
> struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
> +struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> struct i2c_msg msg = {
> .addr= 0,
> .flags= I2C_M_RD,
> @@ -251,6 +256,8 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control)
> .buf= buff,
> };
>
> +*bad_gpu = false;
> +
> /* Verify i2c adapter is initialized */
> if (!adev->pm.smu_i2c.algo)
> return -ENOENT;
> @@ -279,6 +286,11 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control)
> DRM_DEBUG_DRIVER("Found existing EEPROM table with %d
> records",
> control->num_recs);
>
> +} else if ((ras->bad_page_cnt_threshold != 0xFFFFFFFF) && (
> +hdr->header == EEPROM_TABLE_HDR_BAD)) {
> +*bad_gpu = true;
> +DRM_ERROR("Detect BAD GPU TAG in eeprom table and "
> +"GPU shall be RMAed.\n");
> } else {
> DRM_INFO("Creating new EEPROM table");
>
> @@ -375,6 +387,44 @@ static uint32_t
> __correct_eeprom_dest_address(uint32_t curr_address)
> return curr_address;
> }
>
> +int amdgpu_ras_eeprom_check_bad_gpu(struct amdgpu_ras_eeprom_control
> *control,
> +bool *bad_gpu)
> +{
> +struct amdgpu_device *adev = to_amdgpu_device(control);
> +unsigned char buff[EEPROM_ADDRESS_SIZE +
> +EEPROM_TABLE_HEADER_SIZE] = { 0 };
> +struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
> +struct i2c_msg msg = {
> +.addr = control->i2c_address,
> +.flags = I2C_M_RD,
> +.len = EEPROM_ADDRESS_SIZE +
> EEPROM_TABLE_HEADER_SIZE,
> +.buf = buff,
> +};
> +int ret;
> +
> +*bad_gpu = false;
> +
> +/* read EEPROM table header */
> +mutex_lock(&control->tbl_mutex);
> +ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
> +if (ret < 1) {
> +dev_err(adev->dev, "Failed to read EEPROM table header.\n");
> +goto err;
[Tao] One tab is missed
[Guchun]Correct this later.
> +}
> +
> +__decode_table_header_from_buff(hdr, &buff[2]);
> +
> +if (hdr->header == EEPROM_TABLE_HDR_BAD) {
> +dev_warn(adev->dev, "Current GPU is BAD and should be
> RMAed.\n");
> +*bad_gpu = true;
> +}
> +
> +ret = 0;
> +err:
> +mutex_unlock(&control->tbl_mutex);
> +return ret;
> +}
> +
> int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control
> *control,
> struct eeprom_table_record
> *records,
> bool write,
> @@ -383,8 +433,10 @@ int amdgpu_ras_eeprom_process_recods(struct
> amdgpu_ras_eeprom_control *control,
> int i, ret = 0;
> struct i2c_msg *msgs, *msg;
> unsigned char *buffs, *buff;
> +bool sched_ras_recovery = false;
> struct eeprom_table_record *record;
> struct amdgpu_device *adev = to_amdgpu_device(control);
> +struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>
> if (adev->asic_type != CHIP_VEGA20 && adev->asic_type !=
> CHIP_ARCTURUS)
> return 0;
> @@ -402,6 +454,25 @@ int amdgpu_ras_eeprom_process_recods(struct
> amdgpu_ras_eeprom_control *control,
> goto free_buff;
> }
>
> +/*
> + * If saved bad pages number exceeds the bad page threshod for
> + * the whole VRAM, update table header to mark one BAD GPU and
> + * schedule one ras recovery after eeprom write is done, this
> + * can avoid the missing for latest records.
> + *
> + * This new header will be picked up and checked in the bootup by
> + * ras recovery, which may break bootup process to notify user this
> + * GPU is bad and to RMA(Return Merchandise Authorization) this GPU.
> + */
> +if (write && (ras->bad_page_cnt_threshold != 0xFFFFFFFF) &&
> +((control->num_recs + num) >= ras->bad_page_cnt_threshold)) {
> +dev_warn(adev->dev,
> +"Saved bad pages(%d) reaches threshold value(%d).\n",
> +control->num_recs + num, ras-
> >bad_page_cnt_threshold);
> +control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
> +sched_ras_recovery = true;
> +}
> +
> /* In case of overflow just start from beginning to not lose newest
> records */
> if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE *
> num > EEPROM_SIZE_BYTES))
> control->next_addr = EEPROM_RECORD_START; @@ -482,6
> +553,20 @@ int amdgpu_ras_eeprom_process_recods(struct
> amdgpu_ras_eeprom_control *control,
> __update_tbl_checksum(control, records, num,
> old_hdr_byte_sum);
>
> __update_table_header(control, buffs);
> +
> +if (sched_ras_recovery) {
> +/*
> + * Before scheduling ras recovery, assert the related
> + * flag first, which shall bypass common bad page
> + * reservation execution in amdgpu_ras_reset_gpu.
> + */
> +amdgpu_ras_get_context(adev)->flags |=
> +AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV;
> +
> +dev_warn(adev->dev, "Conduct ras recovery due to bad
> "
> +"page threshold reached.\n");
> +amdgpu_ras_reset_gpu(adev);
> +}
> } else if (!__validate_tbl_checksum(control, records, num)) {
> DRM_WARN("EEPROM Table checksum mismatch!");
> /* TODO Uncomment when EEPROM read/write is relliable */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index b272840cb069..4ccd139248a9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -77,9 +77,13 @@ struct eeprom_table_record {
> unsigned char mcumc_id;
> }__attribute__((__packed__));
>
> -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
> +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> +bool *bad_gpu);
> int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control
> *control);
>
> +int amdgpu_ras_eeprom_check_bad_gpu(struct amdgpu_ras_eeprom_control
> *control,
> +bool *bad_gpu);
> +
> int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control
> *control,
> struct eeprom_table_record
> *records,
> bool write,
> --
> 2.17.1
More information about the amd-gfx
mailing list