[PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS

Mon Sep 2 03:00:24 UTC 2019

> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen at amd.com>
> Sent: 2019年9月2日 10:11
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org;
> Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Li, Dennis
> <Dennis.Li at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: RE: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
> 
> 
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Friday, August 30, 2019 8:25 PM
> To: amd-gfx at lists.freedesktop.org; Grodzovsky, Andrey
> <Andrey.Grodzovsky at amd.com>; Chen, Guchun <Guchun.Chen at amd.com>;
> Li, Dennis <Dennis.Li at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
> 
> Support loading and saving EEPROM records for RAS, and move the saving
> of EEPROM records into bad page reservation.
> 
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++--
> ----
>  1 file changed, 83 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 24663ec41248..02120aa3cb5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1348,6 +1348,72 @@ int amdgpu_ras_add_bad_pages(struct
> amdgpu_device *adev,
>  	return ret;
>  }
> 
> +/*
> + * write error record array to eeprom, the function should be
> + * protected by recovery_lock
> + */
> +static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) {
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	struct ras_err_handler_data *data;
> +	struct amdgpu_ras_eeprom_control *control =
> +					&adev->psp.ras.ras->eeprom_control;
> +	int save_count;
> +
> +	if (!con || !con->eh_data)
> +		return 0;
> +
> +	data = con->eh_data;
> +	if (!data)
> +		return 0;
> [Guchun] This check (!data) is redundant and can be removed, because
> !con->eh_data has already been checked above and the whole function is
> protected by recovery_lock.

[Tao] OK, I'll remove it.

> 
> +	save_count = data->count - control->num_recs;
> +	/* only new entries are saved */
> +	if (save_count > 0)
> +		if (amdgpu_ras_eeprom_process_recods(&con-
> >eeprom_control,
> +							&data->bps[control-
> >num_recs],
> +							true,
> +							save_count)) {
> +			DRM_ERROR("Failed to save EEPROM table data!");
> +			return -EIO;
> +		}
> +
> +	return 0;
> +}
> +
> +/*
> + * read error record array in eeprom and reserve enough space for
> + * storing new bad pages
> + */
> +static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) {
> +	struct amdgpu_ras_eeprom_control *control =
> +					&adev->psp.ras.ras->eeprom_control;
> +	struct eeprom_table_record *bps = NULL;
> +	int ret = 0;
> +
> +	/* no bad page record, skip eeprom access */
> +	if (!control->num_recs)
> +		return ret;
> +
> +	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
> +	if (!bps)
> +		return -ENOMEM;
> +
> +	if (amdgpu_ras_eeprom_process_recods(control, bps, false,
> +		control->num_recs)) {
> +		DRM_ERROR("Failed to load EEPROM table records!");
> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
> +
> +out:
> +	kfree(bps);
> +	return ret;
> +}
> +
>  /* called in gpu recovery/init */
>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)  { @@ -
> 1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
>  	struct ras_err_handler_data *data;
>  	uint64_t bp;
>  	struct amdgpu_bo *bo;
> -	int i;
> +	int i, ret = 0;
> 
>  	if (!con || !con->eh_data)
>  		return 0;
> @@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
>  		data->bps_bo[i] = bo;
>  		data->last_reserved = i + 1;
>  	}
> +
> +	ret = amdgpu_ras_save_bad_pages(adev);
>  out:
>  	mutex_unlock(&con->recovery_lock);
> -	return 0;
> +	return ret;
>  }
> 
>  /* called when driver unload */
> @@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct
> amdgpu_device *adev)
>  	return 0;
>  }
> 
> -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{
> -	/* TODO
> -	 * write the array to eeprom when SMU disabled.
> -	 */
> -	return 0;
> -}
> -
> -/*
> - * read error record array in eeprom and reserve enough space for
> - * storing new bad pages
> - */
> -static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{
> -	struct eeprom_table_record *bps = NULL;
> -	int ret;
> -
> -	ret = amdgpu_ras_add_bad_pages(adev, bps,
> -				adev->umc.max_ras_err_cnt_per_query);
> -
> -	return ret;
> -}
> -
>  static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  {
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>  	struct ras_err_handler_data **data = &con->eh_data;
> +	int ret;
> 
>  	*data = kmalloc(sizeof(**data),
>  			GFP_KERNEL|__GFP_ZERO);
> @@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct
> amdgpu_device *adev)
>  	atomic_set(&con->in_recovery, 0);
>  	con->adev = adev;
> 
> -	amdgpu_ras_load_bad_pages(adev);
> -	amdgpu_ras_reserve_bad_pages(adev);
> +	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras-
> >eeprom_control);
> +	if (ret)
> +		return ret;
> +
> +	if (adev->psp.ras.ras->eeprom_control.num_recs) {
> +		ret = amdgpu_ras_load_bad_pages(adev);
> +		if (ret)
> +			return ret;
> +		ret = amdgpu_ras_reserve_bad_pages(adev);
> +		if (ret)
> +			return ret;
> +	}
> 
>  	return 0;
>  }
> @@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct
> amdgpu_device *adev)
>  	struct ras_err_handler_data *data = con->eh_data;
> 
>  	cancel_work_sync(&con->recovery_work);
> -	amdgpu_ras_save_bad_pages(adev);
>  	amdgpu_ras_release_bad_pages(adev);
> 
>  	mutex_lock(&con->recovery_lock);
> --
> 2.17.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx