[PATCH] drm/amdgpu: Add more types for boot time error reporting

Zhou1, Tao Tao.Zhou1 at amd.com
Thu Aug 1 06:05:12 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang at amd.com>
> Sent: Thursday, August 1, 2024 1:55 PM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: Add more types for boot time error reporting
>
> Data abort exception and unknown errors are supported.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 ++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 ++
>  2 files changed, 12 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 12ab48f26bd5..7aff6150898b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4769,6 +4769,16 @@ static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
>               dev_info(adev->dev,
>                        "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n",
>                        socket_id, aid_id, hbm_id, fw_status);
> +
> +     if (AMDGPU_RAS_GPU_ERR_DATA_ABORT(boot_error))
> +             dev_info(adev->dev,
> +                      "socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n",
> +                      socket_id, aid_id, fw_status);
> +
> +     if (AMDGPU_RAS_GPU_ERR_UNKNOWN(boot_error))
> +             dev_info(adev->dev,
> +                      "socket: %d, aid: %d, fw_status: 0x%x, unknown boot time errors\n",
> +                      socket_id, aid_id, fw_status);
>  }
>
>  static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 7ddd13d5c06b..0d49b74bfe5e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -46,6 +46,8 @@ struct amdgpu_iv_entry;
>  #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)              AMDGPU_GET_REG_FIELD(x, 10, 8)
>  #define AMDGPU_RAS_GPU_ERR_AID_ID(x)                 AMDGPU_GET_REG_FIELD(x, 12, 11)
>  #define AMDGPU_RAS_GPU_ERR_HBM_ID(x)                 AMDGPU_GET_REG_FIELD(x, 14, 13)
> +#define AMDGPU_RAS_GPU_ERR_DATA_ABORT(x)             AMDGPU_GET_REG_FIELD(x, 29, 29)
> +#define AMDGPU_RAS_GPU_ERR_UNKNOWN(x)                AMDGPU_GET_REG_FIELD(x, 30, 30)
>
>  #define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 100
>  #define AMDGPU_RAS_BOOT_STEADY_STATUS                0xBA
> --
> 2.17.1



More information about the amd-gfx mailing list