[PATCH] drm/amdgpu: save error count in RAS poison handler
Zhou1, Tao
Tao.Zhou1 at amd.com
Tue Dec 21 07:01:46 UTC 2021
[AMD Official Use Only]
> -----Original Message-----
> From: Yang, Stanley <Stanley.Yang at amd.com>
> Sent: Tuesday, December 21, 2021 2:05 PM
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org; Zhang,
> Hawking <Hawking.Zhang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
> Subject: 回复: [PATCH] drm/amdgpu: save error count in RAS poison handler
>
> [AMD Official Use Only]
>
> > +void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
> > +{
> > + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)
> > &&
> > + adev->umc.ras_if) {
> > + struct ras_common_if *ras_if = adev->umc.ras_if;
> > + struct ras_ih_if ih_info = {
> > + .head = *ras_if,
> > + .cb = amdgpu_umc_process_ras_data_cb,
> > + };
> > +
> > + amdgpu_ras_late_fini(adev, ras_if, &ih_info);
> > + kfree(ras_if);
> > + }
> > +}
> > +
> > +
> > +
> [Yang, Stanley] It would be better to remove the extra blank lines.
> > int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
> > struct amdgpu_irq_src *source,
> > struct amdgpu_iv_entry *entry)
>
> Other than the above, the patch is Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>
>
> > -----邮件原件-----
> > 发件人: Zhou1, Tao <Tao.Zhou1 at amd.com>
> > 发送时间: Monday, December 20, 2021 4:51 PM
> > 收件人: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> > <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai,
> > Thomas <YiPeng.Chai at amd.com>
> > 抄送: Zhou1, Tao <Tao.Zhou1 at amd.com>
> > 主题: [PATCH] drm/amdgpu: save error count in RAS poison handler
> >
> > Otherwise the RAS error count couldn't be queried from sysfs.
> >
> > Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +-
> > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 170 ++++++++++++------
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 +-
> > 3 files changed, 99 insertions(+), 76 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > index 0bf09a94d944..776a947b45df 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > @@ -727,7 +727,7 @@ void
> > amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device
> > *adev, bo
> >
> > /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
> > if (!adev->gmc.xgmi.connected_to_cpu)
> > - amdgpu_umc_do_page_retirement(adev, &err_data, NULL,
> > reset);
> > + amdgpu_umc_poison_handler(adev, &err_data, reset);
> > else if (reset)
> > amdgpu_amdkfd_gpu_reset(adev);
> > }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > index 0c33f367a4e5..1c2dbd00f647 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > @@ -23,79 +23,7 @@
> >
> > #include "amdgpu_ras.h"
> >
> > -static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
> > - void *ras_error_status,
> > - struct amdgpu_iv_entry *entry)
> > -{
> > - return amdgpu_umc_do_page_retirement(adev, ras_error_status,
> > entry, true);
> > -}
> > -
> > > -int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
> > > -{
> > - int r;
> > - struct ras_fs_if fs_info = {
> > - .sysfs_name = "umc_err_count",
> > - };
> > - struct ras_ih_if ih_info = {
> > - .cb = amdgpu_umc_process_ras_data_cb,
> > - };
> > -
> > - if (!adev->umc.ras_if) {
> > - adev->umc.ras_if =
> > - kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> > - if (!adev->umc.ras_if)
> > - return -ENOMEM;
> > - adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
> > - adev->umc.ras_if->type =
> > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> > - adev->umc.ras_if->sub_block_index = 0;
> > - }
> > - ih_info.head = fs_info.head = *adev->umc.ras_if;
> > -
> > - r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
> > - &fs_info, &ih_info);
> > - if (r)
> > - goto free;
> > -
> > - if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
> > - r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
> > - if (r)
> > - goto late_fini;
> > - } else {
> > - r = 0;
> > - goto free;
> > - }
> > -
> > - /* ras init of specific umc version */
> > - if (adev->umc.ras_funcs &&
> > - adev->umc.ras_funcs->err_cnt_init)
> > - adev->umc.ras_funcs->err_cnt_init(adev);
> > -
> > - return 0;
> > -
> > -late_fini:
> > - amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
> > -free:
> > - kfree(adev->umc.ras_if);
> > - adev->umc.ras_if = NULL;
> > - return r;
> > -}
> > -
> > > -void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
> > > -{
> > - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)
> > &&
> > - adev->umc.ras_if) {
> > - struct ras_common_if *ras_if = adev->umc.ras_if;
> > - struct ras_ih_if ih_info = {
> > - .head = *ras_if,
> > - .cb = amdgpu_umc_process_ras_data_cb,
> > - };
> > -
> > - amdgpu_ras_late_fini(adev, ras_if, &ih_info);
> > - kfree(ras_if);
> > - }
> > -}
> > -
> > -int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> > +static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> > void *ras_error_status,
> > struct amdgpu_iv_entry *entry,
> > bool reset)
> > @@ -180,6 +108,102 @@ int amdgpu_umc_do_page_retirement(struct
> > amdgpu_device *adev,
> > return AMDGPU_RAS_SUCCESS;
> > }
> >
> > +int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
> > + void *ras_error_status,
> > + bool reset)
> > +{
> > + int ret;
> > + struct ras_err_data *err_data = (struct ras_err_data
> > *)ras_error_status;
> > + struct ras_common_if head = {
> > + .block = AMDGPU_RAS_BLOCK__UMC,
> > + };
> > + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
> > +
> > + ret =
> > + amdgpu_umc_do_page_retirement(adev, ras_error_status,
> > NULL, reset);
> > +
> > + if (ret == AMDGPU_RAS_SUCCESS && obj) {
> > + obj->err_data.ue_count += err_data->ue_count;
> > + obj->err_data.ce_count += err_data->ce_count;
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
> > + void *ras_error_status,
> > + struct amdgpu_iv_entry *entry)
> > +{
> > > + return amdgpu_umc_do_page_retirement(adev, ras_error_status,
> > > entry, true);
> > > +}
> > +
> > > +int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
> > > +{
> > + int r;
> > + struct ras_fs_if fs_info = {
> > + .sysfs_name = "umc_err_count",
> > + };
> > + struct ras_ih_if ih_info = {
> > + .cb = amdgpu_umc_process_ras_data_cb,
> > + };
> > +
> > + if (!adev->umc.ras_if) {
> > + adev->umc.ras_if =
> > + kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> > + if (!adev->umc.ras_if)
> > + return -ENOMEM;
> > + adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
> > + adev->umc.ras_if->type =
> > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> > + adev->umc.ras_if->sub_block_index = 0;
> > + }
> > + ih_info.head = fs_info.head = *adev->umc.ras_if;
> > +
> > + r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
> > + &fs_info, &ih_info);
> > + if (r)
> > + goto free;
> > +
> > + if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
> > + r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
> > + if (r)
> > + goto late_fini;
> > + } else {
> > + r = 0;
> > + goto free;
> > + }
> > +
> > + /* ras init of specific umc version */
> > + if (adev->umc.ras_funcs &&
> > + adev->umc.ras_funcs->err_cnt_init)
> > + adev->umc.ras_funcs->err_cnt_init(adev);
> > +
> > + return 0;
> > +
> > +late_fini:
> > + amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
> > +free:
> > + kfree(adev->umc.ras_if);
> > + adev->umc.ras_if = NULL;
> > + return r;
> > +}
> > +
> > > +void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
> > > +{
> > + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)
> > &&
> > + adev->umc.ras_if) {
> > + struct ras_common_if *ras_if = adev->umc.ras_if;
> > + struct ras_ih_if ih_info = {
> > + .head = *ras_if,
> > + .cb = amdgpu_umc_process_ras_data_cb,
> > + };
> > +
> > + amdgpu_ras_late_fini(adev, ras_if, &ih_info);
> > + kfree(ras_if);
> > + }
> > +}
> > +
> > +
> > +
> > > [Yang, Stanley] It would be better to remove the extra blank lines.
[Tao] Thanks for the reminder, I'll remove them before pushing.
>
> > int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
> > struct amdgpu_irq_src *source,
> > struct amdgpu_iv_entry *entry)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > index 8d18d5121f66..b72194e8bfe5 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > @@ -78,9 +78,8 @@ struct amdgpu_umc {
> >
> > > int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
> > > void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
> > > -int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> > +int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
> > void *ras_error_status,
> > - struct amdgpu_iv_entry *entry,
> > bool reset);
> > int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
> > struct amdgpu_irq_src *source,
> > --
> > 2.17.1
More information about the amd-gfx
mailing list