[PATCH 2/3] drm/amdgpu: Refine bad page adding
Zhou1, Tao
Tao.Zhou1 at amd.com
Fri Feb 21 10:18:45 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: Xie, Patrick <Gangliang.Xie at amd.com>
> Sent: Friday, February 21, 2025 11:19 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>; Xie, Patrick <Gangliang.Xie at amd.com>
> Subject: [PATCH 2/3] drm/amdgpu: Refine bad page adding
>
> Bad page adding can be simplified by using the NPS info.
>
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 196 +++++++++++++-----------
> 1 file changed, 105 insertions(+), 91 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5420e2d6d244..439841a2d1c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2801,20 +2801,101 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
> return -EINVAL;
> }
>
> +static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
> + struct eeprom_table_record *bps, int count)
> +{
> + int j;
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct ras_err_handler_data *data = con->eh_data;
> +
> + for (j = 0; j < count; j++) {
> + if (amdgpu_ras_check_bad_page_unlock(con,
> + bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
> + continue;
> +
> + if (!data->space_left &&
> + amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
[Tao] The spaces here should be replaced with tabs.
> + return -ENOMEM;
> + }
> +
> + amdgpu_ras_reserve_page(adev, bps[j].retired_page);
> +
> + memcpy(&data->bps[data->count], &(bps[j]),
> + sizeof(struct eeprom_table_record));
> + data->count++;
> + data->space_left--;
> + }
> +
> + return 0;
> +}
> +
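As a side note on the helper above: it only appends records that are already physical-address based, and it updates the per-device bad-page bookkeeping. A rough sketch of that bookkeeping, with the field names as referenced in the helper (the real struct lives in amdgpu_ras.h):

    struct ras_err_handler_data {
            struct eeprom_table_record *bps; /* bad page records collected so far */
            int count;                       /* number of valid entries in bps */
            int space_left;                  /* free slots left before a realloc */
    };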
> +static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
> + struct eeprom_table_record *bps, struct ras_err_data *err_data,
> + enum amdgpu_memory_partition nps)
> +{
> + int i = 0;
> + int ret = 0;
> + enum amdgpu_memory_partition save_nps;
> +
> + save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
> +
> + for (i = 0; i < adev->umc.retire_unit; i++)
> + bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
> +
> + if (save_nps) {
> + if (save_nps == nps) {
> + if (amdgpu_umc_pages_in_a_row(adev, err_data,
> + bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
> + return -EINVAL;
> + } else {
> + if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
> + return -EINVAL;
> + }
> + } else {
> + if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
> + if (nps == AMDGPU_NPS1_PARTITION_MODE)
> + memcpy(err_data->err_addr, bps,
> + sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
> + else
> + return -EOPNOTSUPP;
> + }
> + }
> +
> + return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
> + adev->umc.retire_unit);
> +}
> +
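As background for the save_nps handling in these helpers: the NPS mode is carried in the high bits of retired_page when the record is written to eeprom, and stripped again before address conversion. A minimal sketch of that encode/decode pair, assuming only the UMC_NPS_SHIFT/UMC_NPS_MASK definitions this patch already relies on (the helper names below are made up for illustration):

    static inline uint64_t pack_nps(uint64_t retired_page,
                                    enum amdgpu_memory_partition nps)
    {
            /* fold the partition mode into the spare high bits */
            return retired_page |
                   ((uint64_t)(nps & UMC_NPS_MASK) << UMC_NPS_SHIFT);
    }

    static inline enum amdgpu_memory_partition unpack_nps(uint64_t retired_page)
    {
            /* recover the mode the record was saved under */
            return (retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
    }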
> +static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
> + struct eeprom_table_record *bps, struct ras_err_data *err_data,
> + enum amdgpu_memory_partition nps)
> +{
> + enum amdgpu_memory_partition save_nps;
> +
> + save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
> + bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
> +
> + if (save_nps == nps) {
> + if (amdgpu_umc_pages_in_a_row(adev, err_data,
> + bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
> + return -EINVAL;
> + } else {
> + if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
> + return -EINVAL;
> + }
> + return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
> + adev->umc.retire_unit);
> +}
> +
> /* it deal with vram only. */
> int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> struct eeprom_table_record *bps, int pages, bool from_rom)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> - struct ras_err_handler_data *data;
> struct ras_err_data err_data;
> - struct eeprom_table_record *err_rec;
> struct amdgpu_ras_eeprom_control *control =
> &adev->psp.ras_context.ras->eeprom_control;
> enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
> int ret = 0;
> - uint32_t i, j, loop_cnt = 1;
> - bool find_pages_per_pa = false;
> + uint32_t i;
>
> if (!con || !con->eh_data || !bps || pages <= 0)
> return 0;
> @@ -2825,108 +2906,41 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> sizeof(struct eeprom_table_record), GFP_KERNEL);
> if (!err_data.err_addr) {
> dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
> - ret = -ENOMEM;
> - goto out;
> + return -ENOMEM;
> }
>
> - err_rec = err_data.err_addr;
> - loop_cnt = adev->umc.retire_unit;
> if (adev->gmc.gmc_funcs->query_mem_partition_mode)
> nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
> }
>
> mutex_lock(&con->recovery_lock);
> - data = con->eh_data;
> - if (!data) {
> - /* Returning 0 as the absence of eh_data is acceptable */
> - goto free;
> - }
> -
> - for (i = 0; i < pages; i++) {
> - if (from_rom &&
> - control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
> - if (!find_pages_per_pa) {
> - if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
> - if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
> - /* may use old RAS TA, use PA to find pages in
> - * one row
> - */
> - if (amdgpu_umc_pages_in_a_row(adev, &err_data,
> - bps[i].retired_page <<
> - AMDGPU_GPU_PAGE_SHIFT)) {
> - ret = -EINVAL;
> - goto free;
> - } else {
> - find_pages_per_pa = true;
> - }
> - } else {
> - /* unsupported cases */
> - ret = -EOPNOTSUPP;
> - goto free;
> - }
> - }
> - } else {
> - if (amdgpu_umc_pages_in_a_row(adev, &err_data,
> - bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
> - ret = -EINVAL;
> - goto free;
> - }
> - }
> - } else {
> - if (from_rom && !find_pages_per_pa) {
> - if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
> - /* bad page in any NPS mode in eeprom */
> - if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
> - ret = -EINVAL;
> +
> + if (from_rom) {
> + for (i = 0; i < pages; i++) {
> + if (control->ras_num_recs - i >= adev->umc.retire_unit) {
> + if ((bps[i].address == bps[i + 1].address) &&
> + (bps[i].mem_channel == bps[i + 1].mem_channel)) {
> + //deal with retire_unit records a time
> + ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
> + &bps[i], &err_data, nps);
> + if (ret)
> goto free;
> - }
> + i += (adev->umc.retire_unit - 1);
> } else {
> - /* legacy bad page in eeprom, generated only in
> - * NPS1 mode
> - */
> - if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
> - /* old RAS TA or ASICs which don't support to
> - * convert addrss via mca address
> - */
> - if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
> - find_pages_per_pa = true;
> - err_rec = &bps[i];
> - loop_cnt = 1;
> - } else {
> - /* non-nps1 mode, old RAS TA
> - * can't support it
> - */
> - ret = -EOPNOTSUPP;
> - goto free;
> - }
> - }
> + break;
> }
> -
> - if (!find_pages_per_pa)
> - i += (adev->umc.retire_unit - 1);
> } else {
> - err_rec = &bps[i];
> + break;
> }
> }
> -
> - for (j = 0; j < loop_cnt; j++) {
> - if (amdgpu_ras_check_bad_page_unlock(con,
> - err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
> - continue;
> -
> - if (!data->space_left &&
> - amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
> - ret = -ENOMEM;
> + for (; i < pages; i++) {
> + ret = __amdgpu_ras_convert_rec_from_rom(adev,
> + &bps[i], &err_data, nps);
> + if (ret)
> goto free;
> - }
> -
> - amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
> -
> - memcpy(&data->bps[data->count], &(err_rec[j]),
> - sizeof(struct eeprom_table_record));
> - data->count++;
> - data->space_left--;
> }
> + } else {
> + ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
> }
>
> free:
> --
> 2.34.1
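To make the reworked flow easier to follow outside the diff context, here is a condensed sketch of what amdgpu_ras_add_bad_pages() does after this patch. It is simplified, not the literal code: locking, the err_data allocation and the free label handling are omitted.

    if (from_rom) {
            /* Consecutive eeprom records that belong to the same retire unit
             * share address and mem_channel; convert such groups of
             * adev->umc.retire_unit records in one call.
             */
            for (i = 0; i < pages; i++) {
                    if (control->ras_num_recs - i >= adev->umc.retire_unit &&
                        bps[i].address == bps[i + 1].address &&
                        bps[i].mem_channel == bps[i + 1].mem_channel) {
                            ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
                                            &bps[i], &err_data, nps);
                            if (ret)
                                    goto free;
                            i += adev->umc.retire_unit - 1;
                    } else {
                            break;
                    }
            }
            /* Whatever is left (older or partial records) is converted one by one. */
            for (; i < pages; i++) {
                    ret = __amdgpu_ras_convert_rec_from_rom(adev,
                                    &bps[i], &err_data, nps);
                    if (ret)
                            goto free;
            }
    } else {
            /* Records not read back from eeprom need no conversion and are
             * appended to eh_data directly.
             */
            ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
    }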