[PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
Zhou1, Tao
Tao.Zhou1 at amd.com
Mon Sep 2 03:00:24 UTC 2019
> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen at amd.com>
> Sent: 2019年9月2日 10:11
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org;
> Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Li, Dennis
> <Dennis.Li at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: RE: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
>
>
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Friday, August 30, 2019 8:25 PM
> To: amd-gfx at lists.freedesktop.org; Grodzovsky, Andrey
> <Andrey.Grodzovsky at amd.com>; Chen, Guchun <Guchun.Chen at amd.com>;
> Li, Dennis <Dennis.Li at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
>
> Support loading and saving EEPROM records for RAS, and move the storing
> of EEPROM records into bad page reserving.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++--
> ----
> 1 file changed, 83 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 24663ec41248..02120aa3cb5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1348,6 +1348,72 @@ int amdgpu_ras_add_bad_pages(struct
> amdgpu_device *adev,
> return ret;
> }
>
> +/*
> + * write error record array to eeprom, the function should be
> + * protected by recovery_lock
> + */
> +static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct ras_err_handler_data *data;
> + struct amdgpu_ras_eeprom_control *control =
> + &adev->psp.ras.ras->eeprom_control;
> + int save_count;
> +
> + if (!con || !con->eh_data)
> + return 0;
> +
> + data = con->eh_data;
> + if (!data)
> + return 0;
> [Guchun] This (!data) check is redundant and not needed: we have already
> checked !con->eh_data earlier, and the whole function is protected by
> recovery_lock.
[Tao] OK, I'll remove it.
>
> + save_count = data->count - control->num_recs;
> + /* only new entries are saved */
> + if (save_count > 0)
> + if (amdgpu_ras_eeprom_process_recods(&con-
> >eeprom_control,
> + &data->bps[control-
> >num_recs],
> + true,
> + save_count)) {
> + DRM_ERROR("Failed to save EEPROM table data!");
> + return -EIO;
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * read error record array in eeprom and reserve enough space for
> + * storing new bad pages
> + */
> +static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) {
> + struct amdgpu_ras_eeprom_control *control =
> + &adev->psp.ras.ras->eeprom_control;
> + struct eeprom_table_record *bps = NULL;
> + int ret = 0;
> +
> + /* no bad page record, skip eeprom access */
> + if (!control->num_recs)
> + return ret;
> +
> + bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
> + if (!bps)
> + return -ENOMEM;
> +
> + if (amdgpu_ras_eeprom_process_recods(control, bps, false,
> + control->num_recs)) {
> + DRM_ERROR("Failed to load EEPROM table records!");
> + ret = -EIO;
> + goto out;
> + }
> +
> + ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
> +
> +out:
> + kfree(bps);
> + return ret;
> +}
> +
> /* called in gpu recovery/init */
> int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ -
> 1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
> struct ras_err_handler_data *data;
> uint64_t bp;
> struct amdgpu_bo *bo;
> - int i;
> + int i, ret = 0;
>
> if (!con || !con->eh_data)
> return 0;
> @@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
> data->bps_bo[i] = bo;
> data->last_reserved = i + 1;
> }
> +
> + ret = amdgpu_ras_save_bad_pages(adev);
> out:
> mutex_unlock(&con->recovery_lock);
> - return 0;
> + return ret;
> }
>
> /* called when driver unload */
> @@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct
> amdgpu_device *adev)
> return 0;
> }
>
> -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{
> - /* TODO
> - * write the array to eeprom when SMU disabled.
> - */
> - return 0;
> -}
> -
> -/*
> - * read error record array in eeprom and reserve enough space for
> - * storing new bad pages
> - */
> -static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{
> - struct eeprom_table_record *bps = NULL;
> - int ret;
> -
> - ret = amdgpu_ras_add_bad_pages(adev, bps,
> - adev->umc.max_ras_err_cnt_per_query);
> -
> - return ret;
> -}
> -
> static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> struct ras_err_handler_data **data = &con->eh_data;
> + int ret;
>
> *data = kmalloc(sizeof(**data),
> GFP_KERNEL|__GFP_ZERO);
> @@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct
> amdgpu_device *adev)
> atomic_set(&con->in_recovery, 0);
> con->adev = adev;
>
> - amdgpu_ras_load_bad_pages(adev);
> - amdgpu_ras_reserve_bad_pages(adev);
> + ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras-
> >eeprom_control);
> + if (ret)
> + return ret;
> +
> + if (adev->psp.ras.ras->eeprom_control.num_recs) {
> + ret = amdgpu_ras_load_bad_pages(adev);
> + if (ret)
> + return ret;
> + ret = amdgpu_ras_reserve_bad_pages(adev);
> + if (ret)
> + return ret;
> + }
>
> return 0;
> }
> @@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct
> amdgpu_device *adev)
> struct ras_err_handler_data *data = con->eh_data;
>
> cancel_work_sync(&con->recovery_work);
> - amdgpu_ras_save_bad_pages(adev);
> amdgpu_ras_release_bad_pages(adev);
>
> mutex_lock(&con->recovery_lock);
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list