[PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo create
Zhou1, Tao
Tao.Zhou1 at amd.com
Mon Sep 30 03:43:20 UTC 2019
> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen at amd.com>
> Sent: 2019年9月30日 11:26
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org;
> Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Subject: RE: [PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo
> create
>
>
> Regards,
> Guchun
>
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Sent: Monday, September 30, 2019 11:20 AM
> To: amd-gfx at lists.freedesktop.org; Grodzovsky, Andrey
> <Andrey.Grodzovsky at amd.com>; Chen, Guchun <Guchun.Chen at amd.com>;
> Zhang, Hawking <Hawking.Zhang at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 3/3] drm/amdgpu: reuse code of ras bad page's bo create
>
> implement ras_create_bad_pages_bo to simplify ras code
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 72 +++++++++++--------------
> 1 file changed, 31 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index d1bafa92ca91..fe3a57e567c8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1430,41 +1430,53 @@ static int amdgpu_ras_load_bad_pages(struct
> amdgpu_device *adev)
> return ret;
> }
>
> -/* called in gpu recovery/init */
> -int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
> +static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)
> {
> + /* Note: the caller should guarantee con and data are not NULL */
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> - struct ras_err_handler_data *data;
> + struct ras_err_handler_data *data = con->eh_data;
> uint64_t bp;
> - struct amdgpu_bo *bo = NULL;
> - int i, ret = 0;
> -
> - if (!con || !con->eh_data)
> - return 0;
> + struct amdgpu_bo *bo;
> + int i;
>
> - mutex_lock(&con->recovery_lock);
> - data = con->eh_data;
> - if (!data)
> - goto out;
> - /* reserve vram at driver post stage. */
> for (i = data->last_reserved; i < data->count; i++) {
> + bo = NULL;
> bp = data->bps[i].retired_page;
>
> - /* There are two cases of reserve error should be ignored:
> + /* There are three cases of reserve error should be ignored:
> * 1) a ras bad page has been allocated (used by someone);
> * 2) a ras bad page has been reserved (duplicate error
> injection
> * for one page);
> + * 3) bo info isn't lost in gpu reset
> */
> if (amdgpu_bo_create_kernel_at(adev, bp <<
> AMDGPU_GPU_PAGE_SHIFT,
> AMDGPU_GPU_PAGE_SIZE,
> AMDGPU_GEM_DOMAIN_VRAM,
> &bo, NULL))
> - DRM_WARN("RAS WARN: reserve vram for retired
> page %llx fail\n", bp);
> -
> - data->bps_bo[i] = bo;
> + DRM_NOTE("RAS NOTE: reserve vram for retired
> page 0x%llx fail\n", bp);
> + else
> + data->bps_bo[i] = bo;
> [Guchun]The "else" should not needed? Otherwise, if
> amdgpu_bo_create_kernel_at always succeeds, we don't catch a chance to
> update bps_bo.
> Is that true?
[Tao] It's intentional. There is only a DRM_NOTE if amdgpu_bo_create_kernel_at fails (the parameter bo is NULL under this condition), and bps_bo contains previous value (or NULL).
We update bps_bo only if amdgpu_bo_create_kernel_at succeeds.
> data->last_reserved = i + 1;
> - bo = NULL;
> }
> +}
> +
> +/* called in gpu recovery/init */
> +int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct ras_err_handler_data *data;
> + int ret = 0;
> +
> + if (!con || !con->eh_data)
> + return 0;
> +
> + mutex_lock(&con->recovery_lock);
> + data = con->eh_data;
> + if (!data)
> + goto out;
> +
> + /* reserve vram at driver post stage. */
> + amdgpu_ras_create_bad_pages_bo(adev);
>
> /* continue to save bad pages to eeprom even reesrve_vram fails */
> ret = amdgpu_ras_save_bad_pages(adev); @@ -1583,9 +1595,6 @@
> void amdgpu_ras_recovery_reset(struct amdgpu_device *adev) {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> struct ras_err_handler_data *data;
> - uint64_t bp;
> - struct amdgpu_bo *bo = NULL;
> - int i;
>
> if (!con || !con->eh_data)
> return;
> @@ -1600,26 +1609,7 @@ void amdgpu_ras_recovery_reset(struct
> amdgpu_device *adev)
> data = con->eh_data;
> /* force to reserve all retired page again */
> data->last_reserved = 0;
> -
> - for (i = data->last_reserved; i < data->count; i++) {
> - bp = data->bps[i].retired_page;
> -
> - /* There are three cases of reserve error should be ignored:
> - * 1) a ras bad page has been allocated (used by someone);
> - * 2) a ras bad page has been reserved (duplicate error
> injection
> - * for one page);
> - * 3) bo info isn't lost in gpu reset
> - */
> - if (amdgpu_bo_create_kernel_at(adev, bp <<
> AMDGPU_GPU_PAGE_SHIFT,
> - AMDGPU_GPU_PAGE_SIZE,
> - AMDGPU_GEM_DOMAIN_VRAM,
> - &bo, NULL))
> - DRM_NOTE("RAS NOTE: recreate bo for retired page
> 0x%llx fail\n", bp);
> - else
> - data->bps_bo[i] = bo;
> - data->last_reserved = i + 1;
> - bo = NULL;
> - }
> + amdgpu_ras_create_bad_pages_bo(adev);
> }
> /* recovery end */
>
> --
> 2.17.1
More information about the amd-gfx
mailing list