[PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)

Zhou1, Tao Tao.Zhou1 at amd.com
Fri Apr 22 04:03:51 UTC 2022


[AMD Official Use Only]

Hi Hawking,

The logic in my patch is:

if (poison)
    if (umc)
        poison_creation_handler
    else
        poison_consumption_handler
else
    if (umc)
        umc_handler
    else
        not supported

I think your suggestion is:

if (umc)
    if (poison)
        poison_creation_handler
    else
        umc_handler
else
    if (poiosn)
        poison_consumption_handler
    else
        not supported

Either way is OK for me, I don't think one approach is better than another.

Regards,
Tao

> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Thursday, April 21, 2022 10:54 PM
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org; Yang,
> Stanley <Stanley.Yang at amd.com>; Lazar, Lijo <Lijo.Lazar at amd.com>; Ziya,
> Mohammad zafar <Mohammadzafar.Ziya at amd.com>; Chai, Thomas
> <YiPeng.Chai at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: RE: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)
> 
> [AMD Official Use Only]
> 
> Hi Tao,
> 
> I was thinking more aggressive change - current amdgpu_ras_interrupt_handler
> only serves as umc poison (poison mode) or uncorrectable error handler (fue
> mode).
> 
> We can still keep it as a unified entry point, but how about check ip block first,
> then if it is umc, then check poison mode to decide poison creation handling or
> legacy uncorrectable error handling.
> 
> As discussed before, we'll optimize the poison creation handling later. so
> keeping poison_creation_handler and legacy umc ue handler in separated
> functions seems right direction to me.
> 
> Thoughts?
> 
> Regards,
> Hawking
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Wednesday, April 20, 2022 19:30
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Lazar,
> Lijo <Lijo.Lazar at amd.com>; Ziya, Mohammad zafar
> <Mohammadzafar.Ziya at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)
> 
> Prepare for the implementation of poison consumption handler.
> 
> v2: separate umc handler from poison creation.
> 
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70 ++++++++++++++++---------
>  1 file changed, 44 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 4bbed76b79c8..22477f76913a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1515,12 +1515,45 @@ static int amdgpu_ras_fs_fini(struct
> amdgpu_device *adev)
>  /* ras fs end */
> 
>  /* ih begin */
> +static void amdgpu_ras_interrupt_poison_creation_handler(struct
> ras_manager *obj,
> +                               struct amdgpu_iv_entry *entry) {
> +       dev_info(obj->adev->dev,
> +               "Poison is created, no user action is needed.\n"); }
> +
> +static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
> +                               struct amdgpu_iv_entry *entry) {
> +       struct ras_ih_data *data = &obj->ih_data;
> +       struct ras_err_data err_data = {0, 0, 0, NULL};
> +       int ret;
> +
> +       if (!data->cb)
> +               return;
> +
> +       /* Let IP handle its data, maybe we need get the output
> +        * from the callback to update the error type/count, etc
> +        */
> +       ret = data->cb(obj->adev, &err_data, entry);
> +       /* ue will trigger an interrupt, and in that case
> +        * we need do a reset to recovery the whole system.
> +        * But leave IP do that recovery, here we just dispatch
> +        * the error.
> +        */
> +       if (ret == AMDGPU_RAS_SUCCESS) {
> +               /* these counts could be left as 0 if
> +                * some blocks do not count error number
> +                */
> +               obj->err_data.ue_count += err_data.ue_count;
> +               obj->err_data.ce_count += err_data.ce_count;
> +       }
> +}
> +
>  static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)  {
>         struct ras_ih_data *data = &obj->ih_data;
>         struct amdgpu_iv_entry entry;
> -       int ret;
> -       struct ras_err_data err_data = {0, 0, 0, NULL};
> 
>         while (data->rptr != data->wptr) {
>                 rmb();
> @@ -1531,30 +1564,15 @@ static void amdgpu_ras_interrupt_handler(struct
> ras_manager *obj)
>                 data->rptr = (data->aligned_element_size +
>                                 data->rptr) % data->ring_size;
> 
> -               if (data->cb) {
> -                       if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
> -                           obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> -                               dev_info(obj->adev->dev,
> -                                               "Poison is created, no user action is needed.\n");
> -                       else {
> -                               /* Let IP handle its data, maybe we need get the output
> -                                * from the callback to udpate the error type/count, etc
> -                                */
> -                               memset(&err_data, 0, sizeof(err_data));
> -                               ret = data->cb(obj->adev, &err_data, &entry);
> -                               /* ue will trigger an interrupt, and in that case
> -                                * we need do a reset to recovery the whole system.
> -                                * But leave IP do that recovery, here we just dispatch
> -                                * the error.
> -                                */
> -                               if (ret == AMDGPU_RAS_SUCCESS) {
> -                                       /* these counts could be left as 0 if
> -                                        * some blocks do not count error number
> -                                        */
> -                                       obj->err_data.ue_count += err_data.ue_count;
> -                                       obj->err_data.ce_count += err_data.ce_count;
> -                               }
> -                       }
> +               if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
> +                       if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> +                               amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
> +               } else {
> +                       if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> +                               amdgpu_ras_interrupt_umc_handler(obj, &entry);
> +                       else
> +                               dev_warn(obj->adev->dev,
> +                                       "No RAS interrupt handler for
> +non-UMC block with poison disabled.\n");
>                 }
>         }
>  }
> --
> 2.35.1
> 


More information about the amd-gfx mailing list