[PATCH] drm/amdgpu: Update programming for boot error reporting
Zhou1, Tao
Tao.Zhou1 at amd.com
Thu May 30 09:16:48 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
One more question, do we need to consider the compatible with old FW?
> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang at amd.com>
> Sent: Thursday, May 30, 2024 4:41 PM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: Update programming for boot error reporting
>
> AMDGPU_RAS_GPU_ERR_BOOT_STATUS field is no longer valid.
> The polling sequence is also simplifed according to the latest firmware change.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 99 +++++++++++--------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +-
> 2 files changed, 46 insertions(+), 57 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index eedf2b613ac2..2c338d39cd45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4416,64 +4416,74 @@ int amdgpu_ras_error_statistic_de_count(struct
> ras_err_data *err_data,
> #define mmMP0_SMN_C2PMSG_92 0x1609C
> #define mmMP0_SMN_C2PMSG_126 0x160BE
> static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device
> *adev,
> - u32 instance, u32 boot_error)
> + u32 instance)
> {
> u32 socket_id, aid_id, hbm_id;
> - u32 reg_data;
> + u32 fw_status;
> + u32 boot_error;
> u64 reg_addr;
>
> - socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
> - aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
> - hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
> -
> /* The pattern for smn addressing in other SOC could be different from
> * the one for aqua_vanjaram. We should revisit the code if the pattern
> * is changed. In such case, replace the aqua_vanjaram implementation
> * with more common helper */
> reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
> aqua_vanjaram_encode_ext_smn_addressing(instance);
> + fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> +
> + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
> + aqua_vanjaram_encode_ext_smn_addressing(instance);
> + boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
>
> - reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> - dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status
> is 0x%x\n",
> - socket_id, aid_id, reg_data);
> + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
> + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
> + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
>
> if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
> - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory
> training failed\n",
> - socket_id, aid_id, hbm_id);
> + dev_info(adev->dev,
> + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x,
> memory training failed\n",
> + socket_id, aid_id, hbm_id, fw_status);
>
> if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
> - dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at
> boot time\n",
> - socket_id, aid_id);
> + dev_info(adev->dev,
> + "socket: %d, aid: %d, fw_status: 0x%x, firmware load
> failed at boot time\n",
> + socket_id, aid_id, fw_status);
>
> if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
> - dev_info(adev->dev, "socket: %d, aid: %d, wafl link training
> failed\n",
> - socket_id, aid_id);
> + dev_info(adev->dev,
> + "socket: %d, aid: %d, fw_status: 0x%x, wafl link training
> failed\n",
> + socket_id, aid_id, fw_status);
>
> if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
> - dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training
> failed\n",
> - socket_id, aid_id);
> + dev_info(adev->dev,
> + "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training
> failed\n",
> + socket_id, aid_id, fw_status);
>
> if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
> - dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training
> failed\n",
> - socket_id, aid_id);
> + dev_info(adev->dev,
> + "socket: %d, aid: %d, fw_status: 0x%x, usr cp link
> training failed\n",
> + socket_id, aid_id, fw_status);
>
> if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
> - dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training
> failed\n",
> - socket_id, aid_id);
> + dev_info(adev->dev,
> + "socket: %d, aid: %d, fw_status: 0x%x, usr dp link
> training failed\n",
> + socket_id, aid_id, fw_status);
>
> if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
> - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm
> memory test failed\n",
> - socket_id, aid_id, hbm_id);
> + dev_info(adev->dev,
> + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm
> memory test failed\n",
> + socket_id, aid_id, hbm_id, fw_status);
>
> if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
> - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist
> test failed\n",
> - socket_id, aid_id, hbm_id);
> + dev_info(adev->dev,
> + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm
> bist test failed\n",
> + socket_id, aid_id, hbm_id, fw_status);
> }
>
> -static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
> - u32 instance, u32 *boot_error)
> +static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
> + u32 instance)
> {
> - u32 reg_addr;
> + u64 reg_addr;
> u32 reg_data;
> int retry_loop;
>
> @@ -4482,41 +4492,22 @@ static int
> amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
>
> for (retry_loop = 0; retry_loop <
> AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
> reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> - if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) ==
> AMDGPU_RAS_BOOT_STEADY_STATUS) {
> - *boot_error = AMDGPU_RAS_BOOT_SUCEESS;
> - return 0;
> - }
> - msleep(1);
> - }
> -
> - /* The pattern for smn addressing in other SOC could be different from
> - * the one for aqua_vanjaram. We should revisit the code if the pattern
> - * is changed. In such case, replace the aqua_vanjaram implementation
> - * with more common helper */
> - reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
> - aqua_vanjaram_encode_ext_smn_addressing(instance);
> -
> - for (retry_loop = 0; retry_loop <
> AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
> - reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> - if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
> - *boot_error = reg_data;
> - return 0;
> - }
> - msleep(1);
> + if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) ==
> AMDGPU_RAS_BOOT_STEADY_STATUS)
> + return false;
> + else
> + msleep(1);
> }
>
> - *boot_error = reg_data;
> - return -ETIME;
> + return true;
> }
>
> void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32
> num_instances) {
> - u32 boot_error = 0;
> u32 i;
>
> for (i = 0; i < num_instances; i++) {
> - if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error))
> - amdgpu_ras_boot_time_error_reporting(adev, i,
> boot_error);
> + if (amdgpu_ras_boot_error_detected(adev, i))
> + amdgpu_ras_boot_time_error_reporting(adev, i);
> }
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 7021c4a66fb5..d0a125743d3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -47,12 +47,10 @@ struct amdgpu_iv_entry;
> #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)
> AMDGPU_GET_REG_FIELD(x, 10, 8)
> #define AMDGPU_RAS_GPU_ERR_AID_ID(x)
> AMDGPU_GET_REG_FIELD(x, 12, 11)
> #define AMDGPU_RAS_GPU_ERR_HBM_ID(x)
> AMDGPU_GET_REG_FIELD(x, 14, 13)
> -#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)
> AMDGPU_GET_REG_FIELD(x, 31, 31)
>
> -#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 1000
> +#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 100
> #define AMDGPU_RAS_BOOT_STEADY_STATUS 0xBA
> #define AMDGPU_RAS_BOOT_STATUS_MASK 0xFF
> -#define AMDGPU_RAS_BOOT_SUCEESS 0x80000000
>
> #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0)
> /* position of instance value in sub_block_index of
> --
> 2.17.1
More information about the amd-gfx
mailing list