[PATCH 36/40] drm/amdgpu: Use explicit cardinality for clarity
Alex Deucher
alexdeucher at gmail.com
Thu Jun 10 21:17:40 UTC 2021
On Tue, Jun 8, 2021 at 5:41 PM Luben Tuikov <luben.tuikov at amd.com> wrote:
>
> RAS_MAX_RECORD_NUM may mean the maximum record
> number, as in the maximum house number on your
> street, or it may mean the maximum number of
> records, as in the count of records, which is also
> a number. To make this distinction whether the
> number is ordinal (index) or cardinal (count),
> rename this macro to RAS_MAX_RECORD_COUNT.
>
> This makes it easy to understand what it refers
> to, especially when we compute quantities such as,
> how many records do we have left in the table,
> especially when there are so many other numbers,
> quantities and numerical macros around.
>
> Also rename the long,
> amdgpu_ras_eeprom_get_record_max_length() to the
> more succinct and clear,
> amdgpu_ras_eeprom_max_record_count().
>
> When computing the threshold, which also deals
> with counts, i.e. "how many", use cardinal
> "max_eeprom_records_count", than the quantitative
> "max_eeprom_records_len".
>
> Simplify the logic here and there, as well.
>
> Cc: Guchun Chen <guchun.chen at amd.com>
> Cc: John Clements <john.clements at amd.com>
> Cc: Hawking Zhang <Hawking.Zhang at amd.com>
> Cc: Alexander Deucher <Alexander.Deucher at amd.com>
> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
Acked-by: Alex Deucher <alexander.deucher at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 9 ++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 50 ++++++++-----------
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 8 +--
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 +-
> 4 files changed, 30 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 3de1accb060e37..0203f654576bcc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -853,11 +853,10 @@ MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legac
> module_param_named(reset_method, amdgpu_reset_method, int, 0444);
>
> /**
> - * DOC: bad_page_threshold (int)
> - * Bad page threshold is to specify the threshold value of faulty pages
> - * detected by RAS ECC, that may result in GPU entering bad status if total
> - * faulty pages by ECC exceed threshold value and leave it for user's further
> - * check.
> + * DOC: bad_page_threshold (int) Bad page threshold is specifies the
> + * threshold value of faulty pages detected by RAS ECC, which may
> + * result in the GPU entering bad status when the number of total
> + * faulty pages by ECC exceeds the threshold value.
> */
> MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
> module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 66c96c65e7eeb9..95ab400b641af0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -71,8 +71,8 @@ const char *ras_block_string[] = {
> /* inject address is 52 bits */
> #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
>
> -/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
> -#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL)
> +/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
> +#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
>
> enum amdgpu_ras_retire_page_reservation {
> AMDGPU_RAS_RETIRE_PAGE_RESERVED,
> @@ -1841,27 +1841,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
> static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
> {
> struct amdgpu_ras_eeprom_control *control =
> - &adev->psp.ras.ras->eeprom_control;
> - struct eeprom_table_record *bps = NULL;
> - int ret = 0;
> + &adev->psp.ras.ras->eeprom_control;
> + struct eeprom_table_record *bps;
> + int ret;
>
> /* no bad page record, skip eeprom access */
> - if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
> - return ret;
> + if (control->num_recs == 0 || amdgpu_bad_page_threshold == 0)
> + return 0;
>
> bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
> if (!bps)
> return -ENOMEM;
>
> - if (amdgpu_ras_eeprom_read(control, bps, control->num_recs)) {
> + ret = amdgpu_ras_eeprom_read(control, bps, control->num_recs);
> + if (ret)
> dev_err(adev->dev, "Failed to load EEPROM table records!");
> - ret = -EIO;
> - goto out;
> - }
> -
> - ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
> + else
> + ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
>
> -out:
> kfree(bps);
> return ret;
> }
> @@ -1901,11 +1898,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
> }
>
> static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
> - uint32_t max_length)
> + uint32_t max_count)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> - int tmp_threshold = amdgpu_bad_page_threshold;
> - u64 val;
>
> /*
> * Justification of value bad_page_cnt_threshold in ras structure
> @@ -1926,18 +1921,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
> * take no effect.
> */
>
> - if (tmp_threshold < -1)
> - tmp_threshold = -1;
> - else if (tmp_threshold > max_length)
> - tmp_threshold = max_length;
> + if (amdgpu_bad_page_threshold < 0) {
> + u64 val = adev->gmc.mc_vram_size;
>
> - if (tmp_threshold == -1) {
> - val = adev->gmc.mc_vram_size;
> - do_div(val, RAS_BAD_PAGE_RATE);
> + do_div(val, RAS_BAD_PAGE_COVER);
> con->bad_page_cnt_threshold = min(lower_32_bits(val),
> - max_length);
> + max_count);
> } else {
> - con->bad_page_cnt_threshold = tmp_threshold;
> + con->bad_page_cnt_threshold = min_t(int, max_count,
> + amdgpu_bad_page_threshold);
> }
> }
>
> @@ -1945,7 +1937,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> struct ras_err_handler_data **data;
> - uint32_t max_eeprom_records_len = 0;
> + u32 max_eeprom_records_count = 0;
> bool exc_err_limit = false;
> int ret;
>
> @@ -1965,8 +1957,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> atomic_set(&con->in_recovery, 0);
> con->adev = adev;
>
> - max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
> - amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
> + max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
> + amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
>
> /* Todo: During test the SMU might fail to read the eeprom through I2C
> * when the GPU is pending on XGMI reset during probe time
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 54ef31594accd9..21e1e59e4857ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -54,7 +54,7 @@
> #define RAS_TBL_SIZE_BYTES (256 * 1024)
> #define RAS_HDR_START 0
> #define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
> -#define RAS_MAX_RECORD_NUM ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
> +#define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
> / RAS_TABLE_RECORD_SIZE)
>
> #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
> @@ -532,7 +532,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control,
> * TODO - Check the assumption is correct
> */
> control->num_recs += num;
> - control->num_recs %= RAS_MAX_RECORD_NUM;
> + control->num_recs %= RAS_MAX_RECORD_COUNT;
> control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num;
> if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES)
> control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
> @@ -568,9 +568,9 @@ int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
> return amdgpu_ras_eeprom_xfer(control, records, num, true);
> }
>
> -inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
> +inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
> {
> - return RAS_MAX_RECORD_NUM;
> + return RAS_MAX_RECORD_COUNT;
> }
>
> /* Used for testing if bugs encountered */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 4906ed9fb8cdd3..504729b8053759 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -88,7 +88,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
> int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
> struct eeprom_table_record *records, const u32 num);
>
> -inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void);
> +inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
>
> void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control);
>
> --
> 2.32.0
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list