[PATCH 3/4] drm/amdgpu: add ras POSION_CONSUMPTION event id support

Wang, Yang(Kevin) KevinYang.Wang at amd.com
Wed Jul 3 08:45:21 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Wednesday, July 3, 2024 4:28 PM
To: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
Subject: RE: [PATCH 3/4] drm/amdgpu: add ras POSION_CONSUMPTION event id support

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
> Sent: Wednesday, July 3, 2024 1:52 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>
> Subject: [PATCH 3/4] drm/amdgpu: add ras POSION_CONSUMPTION event id
> support
>
> add amdgpu ras POSION_CONSUMPTION event id support.
>
> Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c         | 16 +++++++++++++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h         |  1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 15 ++++++++++++---
>  3 files changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 8a98611d2353..11f8c37a97ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2076,10 +2076,17 @@ static void
> amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
>       struct amdgpu_ras_block_object *block_obj =
>               amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     enum ras_event_type type =
> RAS_EVENT_TYPE_POISON_CONSUMPTION;
> +     u64 event_id;
> +     int ret;
>
>       if (!block_obj || !con)
>               return;
>
> +     ret = amdgpu_ras_mark_ras_event(adev, type);
> +     if (ret)

[Tao] add warning? Or you can add it in amdgpu_ras_mark_ras_event.
[Kevin]:

Thanks suggestion, I will update v2 and add a warn log in function internal.

Best Regards,
Kevin

> +             return;
> +
>       /* both query_poison_status and handle_poison_consumption are
> optional,
>        * but at least one of them should be implemented if we need poison
>        * consumption handler
> @@ -2104,8 +2111,10 @@ static void
> amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
>        * For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
>        */
>       if (poison_stat && !con->is_rma) {
> -             dev_info(adev->dev, "GPU reset for %s RAS poison consumption
> is issued!\n",
> -                             block_obj->ras_comm.name);
> +             event_id = amdgpu_ras_acquire_event_id(adev, type);
> +             RAS_EVENT_LOG(adev, event_id,
> +                           "GPU reset for %s RAS poison consumption
> + is
> issued!\n",
> +                           block_obj->ras_comm.name);
>               amdgpu_ras_reset_gpu(adev);
>       }
>
> @@ -2498,7 +2507,7 @@ static enum ras_event_type
> amdgpu_ras_get_recovery_event(struct amdgpu_device *a
>       if (amdgpu_ras_intr_triggered())
>               return RAS_EVENT_TYPE_ISR;
>       else
> -             return RAS_EVENT_TYPE_INVALID;
> +             return RAS_EVENT_TYPE_POISON_CONSUMPTION;
>  }
>
>  static void amdgpu_ras_do_recovery(struct work_struct *work) @@
> -3975,6
> +3984,7 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev,
> enum ras_event_type
>       switch (type) {
>       case RAS_EVENT_TYPE_ISR:
>       case RAS_EVENT_TYPE_POISON_CREATION:
> +     case RAS_EVENT_TYPE_POISON_CONSUMPTION:
>               event_mgr = __get_ras_event_mgr(adev);
>               if (!event_mgr)
>                       return RAS_EVENT_INVALID_ID; diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 1343cfbc913b..6086da67fa4e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -433,6 +433,7 @@ enum ras_event_type {
>       RAS_EVENT_TYPE_INVALID = 0,
>       RAS_EVENT_TYPE_ISR,
>       RAS_EVENT_TYPE_POISON_CREATION,
> +     RAS_EVENT_TYPE_POISON_CONSUMPTION,
>       RAS_EVENT_TYPE_COUNT,
>  };
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index 816800555f7f..8a10a0e42846 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -27,6 +27,7 @@
>  #include "soc15_int.h"
>  #include "kfd_device_queue_manager.h"
>  #include "kfd_smi_events.h"
> +#include "amdgpu_ras.h"
>
>  /*
>   * GFX9 SQ Interrupts
> @@ -144,9 +145,11 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>                               uint16_t pasid, uint16_t client_id)  {
>       enum amdgpu_ras_block block = 0;
> -     int old_poison;
>       uint32_t reset = 0;
>       struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
> +     enum ras_event_type type =
> RAS_EVENT_TYPE_POISON_CONSUMPTION;
> +     u64 event_id;
> +     int old_poison, ret;
>
>       if (!p)
>               return;
> @@ -191,10 +194,16 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>               return;
>       }
>
> +     ret = amdgpu_ras_mark_ras_event(dev->adev, type);

[Tao] I don't think it's necessary to mark it both in amdgpu_ras_interrupt_poison_consumption_handler and here.
Another question is, event_interrupt_poison_consumption_v9 is used by several ASICs, are you sure the event id code doesn't affect old ASICs?

[Kevin]:
The event id is only used for format the RAS event log, and it won't affect the code logic for old Asics.

Best Regards,
Kevin
> +     if (ret)
> +             return;
> +
>       kfd_signal_poison_consumed_event(dev, pasid);
>
> -     dev_warn(dev->adev->dev,
> -              "poison is consumed by client %d, kick off gpu reset flow\n",
> client_id);
> +     event_id = amdgpu_ras_acquire_event_id(dev->adev, type);
> +
> +     RAS_EVENT_LOG(dev->adev, event_id,
> +                   "poison is consumed by client %d, kick off gpu
> +reset flow\n", client_id);
>
>       amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev,
>               block, pasid, NULL, NULL, reset);
> --
> 2.34.1




More information about the amd-gfx mailing list