[PATCH 4/4] drm/amdgpu: add ras event state device attribute support

Zhou1, Tao Tao.Zhou1 at amd.com
Thu Jul 4 06:50:39 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

The series is:

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
> Sent: Wednesday, July 3, 2024 5:03 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>
> Subject: [PATCH 4/4] drm/amdgpu: add ras event state device attribute support
>
> Add amdgpu RAS 'event_state' sysfs device attribute support.
>
> Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 56 +++++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  7 +++-
>  2 files changed, 58 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 11f8c37a97ef..d84e4f841ecc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1731,6 +1731,39 @@ static ssize_t
> amdgpu_ras_sysfs_schema_show(struct device *dev,
>       return sysfs_emit(buf, "schema: 0x%x\n", con->schema);  }
>
> +static struct {
> +     enum ras_event_type type;
> +     const char *name;
> +} dump_event[] = {
> +     {RAS_EVENT_TYPE_ISR, "Fault Error"},
> +     {RAS_EVENT_TYPE_POISON_CREATION, "Poison Creation"},
> +     {RAS_EVENT_TYPE_POISON_CONSUMPTION, "Poison Consumption"}, };
> +
> +static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev,
> +                                              struct device_attribute *attr,
> char *buf) {
> +     struct amdgpu_ras *con =
> +             container_of(attr, struct amdgpu_ras, event_state_attr);
> +     struct ras_event_manager *event_mgr = con->event_mgr;
> +     struct ras_event_state *event_state;
> +     int i, size = 0;
> +
> +     if (!event_mgr)
> +             return -EINVAL;
> +
> +     size += sysfs_emit_at(buf, size, "current seqno: %llu\n",
> atomic64_read(&event_mgr->seqno));
> +     for (i = 0; i <  ARRAY_SIZE(dump_event); i++) {
> +             event_state = &event_mgr->event_state[dump_event[i].type];
> +             size += sysfs_emit_at(buf, size, "%s : count:%llu,
> last_seqno:%llu\n",
> +                                   dump_event[i].name,
> +                                   atomic64_read(&event_state->count),
> +                                   event_state->last_seqno);
> +     }
> +
> +     return (ssize_t)size;
> +}
> +
>  static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device
> *adev)  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1748,6
> +1781,7 @@ static int amdgpu_ras_sysfs_remove_dev_attr_node(struct
> amdgpu_device *adev)
>               &con->features_attr.attr,
>               &con->version_attr.attr,
>               &con->schema_attr.attr,
> +             &con->event_state_attr.attr,
>               NULL
>       };
>       struct attribute_group group = {
> @@ -1980,6 +2014,8 @@ static DEVICE_ATTR(version, 0444,
>               amdgpu_ras_sysfs_version_show, NULL);  static
> DEVICE_ATTR(schema, 0444,
>               amdgpu_ras_sysfs_schema_show, NULL);
> +static DEVICE_ATTR(event_state, 0444,
> +                amdgpu_ras_sysfs_event_state_show, NULL);
>  static int amdgpu_ras_fs_init(struct amdgpu_device *adev)  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1990,6
> +2026,7 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
>               &con->features_attr.attr,
>               &con->version_attr.attr,
>               &con->schema_attr.attr,
> +             &con->event_state_attr.attr,
>               NULL
>       };
>       struct bin_attribute *bin_attrs[] = {
> @@ -2012,6 +2049,10 @@ static int amdgpu_ras_fs_init(struct amdgpu_device
> *adev)
>       con->schema_attr = dev_attr_schema;
>       sysfs_attr_init(attrs[2]);
>
> +     /* add event_state entry */
> +     con->event_state_attr = dev_attr_event_state;
> +     sysfs_attr_init(attrs[3]);
> +
>       if (amdgpu_bad_page_threshold != 0) {
>               /* add bad_page_features entry */
>               bin_attr_gpu_vram_bad_pages.private = NULL; @@ -3440,13
> +3481,17 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
>
>  static void ras_event_mgr_init(struct ras_event_manager *mgr)  {
> +     struct ras_event_state *event_state;
>       int i;
>
>       memset(mgr, 0, sizeof(*mgr));
>       atomic64_set(&mgr->seqno, 0);
>
> -     for (i = 0; i < ARRAY_SIZE(mgr->last_seqno); i++)
> -             mgr->last_seqno[i] = RAS_EVENT_INVALID_ID;
> +     for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
> +             event_state = &mgr->event_state[i];
> +             event_state->last_seqno = RAS_EVENT_INVALID_ID;
> +             atomic64_set(&event_state->count, 0);
> +     }
>  }
>
>  static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) @@ -
> 3960,6 +4005,7 @@ static struct ras_event_manager*
> __get_ras_event_mgr(struct amdgpu_device *adev)  int
> amdgpu_ras_mark_ras_event(struct amdgpu_device *adev, enum
> ras_event_type type)  {
>       struct ras_event_manager *event_mgr;
> +     struct ras_event_state *event_state;
>
>       if (type >= RAS_EVENT_TYPE_COUNT)
>               return -EINVAL;
> @@ -3968,7 +4014,9 @@ int amdgpu_ras_mark_ras_event(struct
> amdgpu_device *adev, enum ras_event_type ty
>       if (!event_mgr)
>               return -EINVAL;
>
> -     event_mgr->last_seqno[type] = atomic64_inc_return(&event_mgr-
> >seqno);
> +     event_state = &event_mgr->event_state[type];
> +     event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno);
> +     atomic64_inc(&event_state->count);
>
>       return 0;
>  }
> @@ -3989,7 +4037,7 @@ u64 amdgpu_ras_acquire_event_id(struct
> amdgpu_device *adev, enum ras_event_type
>               if (!event_mgr)
>                       return RAS_EVENT_INVALID_ID;
>
> -             id = event_mgr->last_seqno[type];
> +             id = event_mgr->event_state[type].last_seqno;
>               break;
>       case RAS_EVENT_TYPE_INVALID:
>       default:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 6086da67fa4e..189e2bf53a44 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -436,10 +436,14 @@ enum ras_event_type {
>       RAS_EVENT_TYPE_POISON_CONSUMPTION,
>       RAS_EVENT_TYPE_COUNT,
>  };
> +struct ras_event_state {
> +     u64 last_seqno;
> +     atomic64_t count;
> +};
>
>  struct ras_event_manager {
>       atomic64_t seqno;
> -     u64 last_seqno[RAS_EVENT_TYPE_COUNT];
> +     struct ras_event_state event_state[RAS_EVENT_TYPE_COUNT];
>  };
>
>  struct ras_event_id {
> @@ -493,6 +497,7 @@ struct amdgpu_ras {
>       struct device_attribute features_attr;
>       struct device_attribute version_attr;
>       struct device_attribute schema_attr;
> +     struct device_attribute event_state_attr;
>       struct bin_attribute badpages_attr;
>       struct dentry *de_ras_eeprom_table;
>       /* block array */
> --
> 2.34.1



More information about the amd-gfx mailing list