[PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo create

Mon Sep 30 03:43:20 UTC 2019

> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen at amd.com>
> Sent: 2019年9月30日 11:26
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org;
> Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Subject: RE: [PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo
> create
> 
> 
> Regards,
> Guchun
> 
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Sent: Monday, September 30, 2019 11:20 AM
> To: amd-gfx at lists.freedesktop.org; Grodzovsky, Andrey
> <Andrey.Grodzovsky at amd.com>; Chen, Guchun <Guchun.Chen at amd.com>;
> Zhang, Hawking <Hawking.Zhang at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo create
> 
> implement ras_create_bad_pages_bo to simplify ras code
> 
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 72 +++++++++++--------------
>  1 file changed, 31 insertions(+), 41 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index d1bafa92ca91..fe3a57e567c8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1430,41 +1430,53 @@ static int amdgpu_ras_load_bad_pages(struct
> amdgpu_device *adev)
>  	return ret;
>  }
> 
> -/* called in gpu recovery/init */
> -int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
> +static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)
>  {
> +	/* Note: the caller should guarantee con and data are not NULL */
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> -	struct ras_err_handler_data *data;
> +	struct ras_err_handler_data *data = con->eh_data;
>  	uint64_t bp;
> -	struct amdgpu_bo *bo = NULL;
> -	int i, ret = 0;
> -
> -	if (!con || !con->eh_data)
> -		return 0;
> +	struct amdgpu_bo *bo;
> +	int i;
> 
> -	mutex_lock(&con->recovery_lock);
> -	data = con->eh_data;
> -	if (!data)
> -		goto out;
> -	/* reserve vram at driver post stage. */
>  	for (i = data->last_reserved; i < data->count; i++) {
> +		bo = NULL;
>  		bp = data->bps[i].retired_page;
> 
> -		/* There are two cases of reserve error should be ignored:
> +		/* There are three cases of reserve error should be ignored:
>  		 * 1) a ras bad page has been allocated (used by someone);
>  		 * 2) a ras bad page has been reserved (duplicate error
> injection
>  		 *    for one page);
> +		 * 3) bo info isn't lost in gpu reset
>  		 */
>  		if (amdgpu_bo_create_kernel_at(adev, bp <<
> AMDGPU_GPU_PAGE_SHIFT,
>  					       AMDGPU_GPU_PAGE_SIZE,
>  					       AMDGPU_GEM_DOMAIN_VRAM,
>  					       &bo, NULL))
> -			DRM_WARN("RAS WARN: reserve vram for retired
> page %llx fail\n", bp);
> -
> -		data->bps_bo[i] = bo;
> +			DRM_NOTE("RAS NOTE: reserve vram for retired
> page 0x%llx fail\n", bp);
> +		else
> +			data->bps_bo[i] = bo;
> [Guchun]The "else" should not needed? Otherwise, if
> amdgpu_bo_create_kernel_at always succeeds, we don't catch a chance to
> update bps_bo.
> Is that true?

[Tao] It's intentional. There is only a DRM_NOTE if amdgpu_bo_create_kernel_at fails (the parameter bo is NULL under this condition), and bps_bo contains previous value (or NULL).
We update bps_bo only if amdgpu_bo_create_kernel_at succeeds.

>  		data->last_reserved = i + 1;
> -		bo = NULL;
>  	}
> +}
> +
> +/* called in gpu recovery/init */
> +int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) {
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	struct ras_err_handler_data *data;
> +	int ret = 0;
> +
> +	if (!con || !con->eh_data)
> +		return 0;
> +
> +	mutex_lock(&con->recovery_lock);
> +	data = con->eh_data;
> +	if (!data)
> +		goto out;
> +
> +	/* reserve vram at driver post stage. */
> +	amdgpu_ras_create_bad_pages_bo(adev);
> 
>  	/* continue to save bad pages to eeprom even reesrve_vram fails */
>  	ret = amdgpu_ras_save_bad_pages(adev); @@ -1583,9 +1595,6 @@
> void amdgpu_ras_recovery_reset(struct amdgpu_device *adev)  {
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>  	struct ras_err_handler_data *data;
> -	uint64_t bp;
> -	struct amdgpu_bo *bo = NULL;
> -	int i;
> 
>  	if (!con || !con->eh_data)
>  		return;
> @@ -1600,26 +1609,7 @@ void amdgpu_ras_recovery_reset(struct
> amdgpu_device *adev)
>  	data = con->eh_data;
>  	/* force to reserve all retired page again */
>  	data->last_reserved = 0;
> -
> -	for (i = data->last_reserved; i < data->count; i++) {
> -		bp = data->bps[i].retired_page;
> -
> -		/* There are three cases of reserve error should be ignored:
> -		 * 1) a ras bad page has been allocated (used by someone);
> -		 * 2) a ras bad page has been reserved (duplicate error
> injection
> -		 *    for one page);
> -		 * 3) bo info isn't lost in gpu reset
> -		 */
> -		if (amdgpu_bo_create_kernel_at(adev, bp <<
> AMDGPU_GPU_PAGE_SHIFT,
> -					       AMDGPU_GPU_PAGE_SIZE,
> -					       AMDGPU_GEM_DOMAIN_VRAM,
> -					       &bo, NULL))
> -			DRM_NOTE("RAS NOTE: recreate bo for retired page
> 0x%llx fail\n", bp);
> -		else
> -			data->bps_bo[i] = bo;
> -		data->last_reserved = i + 1;
> -		bo = NULL;
> -	}
> +	amdgpu_ras_create_bad_pages_bo(adev);
>  }
>  /* recovery end */
> 
> --
> 2.17.1