回复: [PATCH] drm/amdgpu: save error count in RAS poison handler
Yang, Stanley
Stanley.Yang at amd.com
Tue Dec 21 06:05:28 UTC 2021
[AMD Official Use Only]
> +void amdgpu_umc_ras_fini(struct amdgpu_device *adev) {
> + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)
> &&
> + adev->umc.ras_if) {
> + struct ras_common_if *ras_if = adev->umc.ras_if;
> + struct ras_ih_if ih_info = {
> + .head = *ras_if,
> + .cb = amdgpu_umc_process_ras_data_cb,
> + };
> +
> + amdgpu_ras_late_fini(adev, ras_if, &ih_info);
> + kfree(ras_if);
> + }
> +}
> +
> +
> +
[Yang, Stanley] It would be better to remove the extra blank lines.
> int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
> struct amdgpu_irq_src *source,
> struct amdgpu_iv_entry *entry)
Other than the above, the patch is Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>
> -----邮件原件-----
> 发件人: Zhou1, Tao <Tao.Zhou1 at amd.com>
> 发送时间: Monday, December 20, 2021 4:51 PM
> 收件人: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai,
> Thomas <YiPeng.Chai at amd.com>
> 抄送: Zhou1, Tao <Tao.Zhou1 at amd.com>
> 主题: [PATCH] drm/amdgpu: save error count in RAS poison handler
>
> Otherwise the RAS error count couldn't be queried from sysfs.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 170 ++++++++++++------
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 +-
> 3 files changed, 99 insertions(+), 76 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 0bf09a94d944..776a947b45df 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -727,7 +727,7 @@ void
> amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device
> *adev, bo
>
> /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
> if (!adev->gmc.xgmi.connected_to_cpu)
> - amdgpu_umc_do_page_retirement(adev, &err_data, NULL,
> reset);
> + amdgpu_umc_poison_handler(adev, &err_data, reset);
> else if (reset)
> amdgpu_amdkfd_gpu_reset(adev);
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index 0c33f367a4e5..1c2dbd00f647 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -23,79 +23,7 @@
>
> #include "amdgpu_ras.h"
>
> -static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
> - void *ras_error_status,
> - struct amdgpu_iv_entry *entry)
> -{
> - return amdgpu_umc_do_page_retirement(adev, ras_error_status,
> entry, true);
> -}
> -
> -int amdgpu_umc_ras_late_init(struct amdgpu_device *adev) -{
> - int r;
> - struct ras_fs_if fs_info = {
> - .sysfs_name = "umc_err_count",
> - };
> - struct ras_ih_if ih_info = {
> - .cb = amdgpu_umc_process_ras_data_cb,
> - };
> -
> - if (!adev->umc.ras_if) {
> - adev->umc.ras_if =
> - kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> - if (!adev->umc.ras_if)
> - return -ENOMEM;
> - adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
> - adev->umc.ras_if->type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> - adev->umc.ras_if->sub_block_index = 0;
> - }
> - ih_info.head = fs_info.head = *adev->umc.ras_if;
> -
> - r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
> - &fs_info, &ih_info);
> - if (r)
> - goto free;
> -
> - if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
> - r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
> - if (r)
> - goto late_fini;
> - } else {
> - r = 0;
> - goto free;
> - }
> -
> - /* ras init of specific umc version */
> - if (adev->umc.ras_funcs &&
> - adev->umc.ras_funcs->err_cnt_init)
> - adev->umc.ras_funcs->err_cnt_init(adev);
> -
> - return 0;
> -
> -late_fini:
> - amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
> -free:
> - kfree(adev->umc.ras_if);
> - adev->umc.ras_if = NULL;
> - return r;
> -}
> -
> -void amdgpu_umc_ras_fini(struct amdgpu_device *adev) -{
> - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)
> &&
> - adev->umc.ras_if) {
> - struct ras_common_if *ras_if = adev->umc.ras_if;
> - struct ras_ih_if ih_info = {
> - .head = *ras_if,
> - .cb = amdgpu_umc_process_ras_data_cb,
> - };
> -
> - amdgpu_ras_late_fini(adev, ras_if, &ih_info);
> - kfree(ras_if);
> - }
> -}
> -
> -int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> +static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> void *ras_error_status,
> struct amdgpu_iv_entry *entry,
> bool reset)
> @@ -180,6 +108,102 @@ int amdgpu_umc_do_page_retirement(struct
> amdgpu_device *adev,
> return AMDGPU_RAS_SUCCESS;
> }
>
> +int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
> + void *ras_error_status,
> + bool reset)
> +{
> + int ret;
> + struct ras_err_data *err_data = (struct ras_err_data
> *)ras_error_status;
> + struct ras_common_if head = {
> + .block = AMDGPU_RAS_BLOCK__UMC,
> + };
> + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
> +
> + ret =
> + amdgpu_umc_do_page_retirement(adev, ras_error_status,
> NULL, reset);
> +
> + if (ret == AMDGPU_RAS_SUCCESS && obj) {
> + obj->err_data.ue_count += err_data->ue_count;
> + obj->err_data.ce_count += err_data->ce_count;
> + }
> +
> + return ret;
> +}
> +
> +static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
> + void *ras_error_status,
> + struct amdgpu_iv_entry *entry)
> +{
> + return amdgpu_umc_do_page_retirement(adev, ras_error_status,
> entry,
> +true); }
> +
> +int amdgpu_umc_ras_late_init(struct amdgpu_device *adev) {
> + int r;
> + struct ras_fs_if fs_info = {
> + .sysfs_name = "umc_err_count",
> + };
> + struct ras_ih_if ih_info = {
> + .cb = amdgpu_umc_process_ras_data_cb,
> + };
> +
> + if (!adev->umc.ras_if) {
> + adev->umc.ras_if =
> + kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> + if (!adev->umc.ras_if)
> + return -ENOMEM;
> + adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
> + adev->umc.ras_if->type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> + adev->umc.ras_if->sub_block_index = 0;
> + }
> + ih_info.head = fs_info.head = *adev->umc.ras_if;
> +
> + r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
> + &fs_info, &ih_info);
> + if (r)
> + goto free;
> +
> + if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
> + r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
> + if (r)
> + goto late_fini;
> + } else {
> + r = 0;
> + goto free;
> + }
> +
> + /* ras init of specific umc version */
> + if (adev->umc.ras_funcs &&
> + adev->umc.ras_funcs->err_cnt_init)
> + adev->umc.ras_funcs->err_cnt_init(adev);
> +
> + return 0;
> +
> +late_fini:
> + amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
> +free:
> + kfree(adev->umc.ras_if);
> + adev->umc.ras_if = NULL;
> + return r;
> +}
> +
> +void amdgpu_umc_ras_fini(struct amdgpu_device *adev) {
> + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)
> &&
> + adev->umc.ras_if) {
> + struct ras_common_if *ras_if = adev->umc.ras_if;
> + struct ras_ih_if ih_info = {
> + .head = *ras_if,
> + .cb = amdgpu_umc_process_ras_data_cb,
> + };
> +
> + amdgpu_ras_late_fini(adev, ras_if, &ih_info);
> + kfree(ras_if);
> + }
> +}
> +
> +
> +
[Yang, Stanley] It would be better to remove the extra blank lines.
> int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
> struct amdgpu_irq_src *source,
> struct amdgpu_iv_entry *entry)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> index 8d18d5121f66..b72194e8bfe5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> @@ -78,9 +78,8 @@ struct amdgpu_umc {
>
> int amdgpu_umc_ras_late_init(struct amdgpu_device *adev); void
> amdgpu_umc_ras_fini(struct amdgpu_device *adev); -int
> amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> +int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
> void *ras_error_status,
> - struct amdgpu_iv_entry *entry,
> bool reset);
> int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
> struct amdgpu_irq_src *source,
> --
> 2.17.1
More information about the amd-gfx
mailing list