[PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
Zhou1, Tao
Tao.Zhou1 at amd.com
Fri Oct 21 02:53:43 UTC 2022
[AMD Official Use Only - General]
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Thursday, October 20, 2022 5:13 PM
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org; Yang,
> Stanley <Stanley.Yang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Li,
> Candice <Candice.Li at amd.com>
> Subject: RE: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for
> MCA
>
> [AMD Official Use Only - General]
>
> +static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
> + struct ras_err_data *err_data, bool reset)
>
>
> + if (adev->gmc.xgmi.connected_to_cpu) {
> + ret = amdgpu_umc_poison_handler_mca(adev,
> ras_error_status, reset);
>
> The input parameters "reset" and "err_data" can be dropped since
> amdgpu_umc_poison_handler_mca is dedicated to MCA platform
[Tao] whether need to do gpu reset is determined by unmap queue status, so reset parameter can't be dropped.
For "err_data", it can be removed currently, but I'm afraid we may need it on other ASICs in the future.
>
> Regards,
> Hawking
>
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Sent: Wednesday, October 19, 2022 16:12
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai,
> Thomas <YiPeng.Chai at amd.com>; Li, Candice <Candice.Li at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 1/4] drm/amdgpu: add RAS page retirement functions for MCA
>
> Define page retirement functions for MCA platform.
>
> v2: remove page retirement handling from MCA poison handler,
> let MCA notifier do page retirement.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 67
> +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> | 2 +
> 2 files changed, 69 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index aad3c8b4c810..9494fa14db9a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -22,6 +22,73 @@
> */
>
> #include "amdgpu.h"
> +#include "umc_v6_7.h"
> +
> +static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
> + struct ras_err_data *err_data, uint64_t
> err_addr,
> + uint32_t ch_inst, uint32_t umc_inst) {
> + switch (adev->ip_versions[UMC_HWIP][0]) {
> + case IP_VERSION(6, 7, 0):
> + umc_v6_7_convert_error_address(adev,
> + err_data, err_addr, ch_inst, umc_inst);
> + break;
> + default:
> + dev_warn(adev->dev,
> + "UMC address to Physical address translation is not
> supported\n");
> + return AMDGPU_RAS_FAIL;
> + }
> +
> + return AMDGPU_RAS_SUCCESS;
> +}
> +
> +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
> + uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
> {
> + struct ras_err_data err_data = {0, 0, 0, NULL};
> + int ret = AMDGPU_RAS_FAIL;
> +
> + err_data.err_addr =
> + kcalloc(adev->umc.max_ras_err_cnt_per_query,
> + sizeof(struct eeprom_table_record), GFP_KERNEL);
> + if (!err_data.err_addr) {
> + dev_warn(adev->dev,
> + "Failed to alloc memory for umc error record in MCA
> notifier!\n");
> + return AMDGPU_RAS_FAIL;
> + }
> +
> + /*
> + * Translate UMC channel address to Physical address
> + */
> + ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
> + ch_inst, umc_inst);
> + if (ret)
> + goto out;
> +
> + if (amdgpu_bad_page_threshold != 0) {
> + amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
> + err_data.err_addr_cnt);
> + amdgpu_ras_save_bad_pages(adev);
> + }
> +
> +out:
> + kfree(err_data.err_addr);
> + return ret;
> +}
> +
> +static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev,
> + struct ras_err_data *err_data, bool reset) {
> + /* MCA poison handler is only responsible for GPU reset,
> + * let MCA notifier do page retirement.
> + */
> + if (reset) {
> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> + amdgpu_ras_reset_gpu(adev);
> + }
> +
> + return AMDGPU_RAS_SUCCESS;
> +}
>
> static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> void *ras_error_status,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> index 3629d8f292ef..659a10de29c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> @@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct ras_err_data
> *err_data, int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
> void *ras_error_status,
> struct amdgpu_iv_entry *entry);
> +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
> + uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
> #endif
> --
> 2.35.1
More information about the amd-gfx
mailing list