[PATCH 4/4] drm/amdgpu: add ras event state device attribute support
Zhou1, Tao
Tao.Zhou1 at amd.com
Thu Jul 4 06:50:39 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
The series is:
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> -----Original Message-----
> From: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
> Sent: Wednesday, July 3, 2024 5:03 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>
> Subject: [PATCH 4/4] drm/amdgpu: add ras event state device attribute support
>
> Add amdgpu RAS 'event_state' sysfs device attribute support.
>
> Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 56 +++++++++++++++++++++++-
> - drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 7 +++-
> 2 files changed, 58 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 11f8c37a97ef..d84e4f841ecc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1731,6 +1731,39 @@ static ssize_t
> amdgpu_ras_sysfs_schema_show(struct device *dev,
> return sysfs_emit(buf, "schema: 0x%x\n", con->schema); }
>
> +static struct {
> + enum ras_event_type type;
> + const char *name;
> +} dump_event[] = {
> + {RAS_EVENT_TYPE_ISR, "Fault Error"},
> + {RAS_EVENT_TYPE_POISON_CREATION, "Poison Creation"},
> + {RAS_EVENT_TYPE_POISON_CONSUMPTION, "Poison Consumption"}, };
> +
> +static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev,
> + struct device_attribute *attr,
> char *buf) {
> + struct amdgpu_ras *con =
> + container_of(attr, struct amdgpu_ras, event_state_attr);
> + struct ras_event_manager *event_mgr = con->event_mgr;
> + struct ras_event_state *event_state;
> + int i, size = 0;
> +
> + if (!event_mgr)
> + return -EINVAL;
> +
> + size += sysfs_emit_at(buf, size, "current seqno: %llu\n",
> atomic64_read(&event_mgr->seqno));
> + for (i = 0; i < ARRAY_SIZE(dump_event); i++) {
> + event_state = &event_mgr->event_state[dump_event[i].type];
> + size += sysfs_emit_at(buf, size, "%s : count:%llu,
> last_seqno:%llu\n",
> + dump_event[i].name,
> + atomic64_read(&event_state->count),
> + event_state->last_seqno);
> + }
> +
> + return (ssize_t)size;
> +}
> +
> static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device
> *adev) {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1748,6
> +1781,7 @@ static int amdgpu_ras_sysfs_remove_dev_attr_node(struct
> amdgpu_device *adev)
> &con->features_attr.attr,
> &con->version_attr.attr,
> &con->schema_attr.attr,
> + &con->event_state_attr.attr,
> NULL
> };
> struct attribute_group group = {
> @@ -1980,6 +2014,8 @@ static DEVICE_ATTR(version, 0444,
> amdgpu_ras_sysfs_version_show, NULL); static
> DEVICE_ATTR(schema, 0444,
> amdgpu_ras_sysfs_schema_show, NULL);
> +static DEVICE_ATTR(event_state, 0444,
> + amdgpu_ras_sysfs_event_state_show, NULL);
> static int amdgpu_ras_fs_init(struct amdgpu_device *adev) {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1990,6
> +2026,7 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
> &con->features_attr.attr,
> &con->version_attr.attr,
> &con->schema_attr.attr,
> + &con->event_state_attr.attr,
> NULL
> };
> struct bin_attribute *bin_attrs[] = {
> @@ -2012,6 +2049,10 @@ static int amdgpu_ras_fs_init(struct amdgpu_device
> *adev)
> con->schema_attr = dev_attr_schema;
> sysfs_attr_init(attrs[2]);
>
> + /* add event_state entry */
> + con->event_state_attr = dev_attr_event_state;
> + sysfs_attr_init(attrs[3]);
> +
> if (amdgpu_bad_page_threshold != 0) {
> /* add bad_page_features entry */
> bin_attr_gpu_vram_bad_pages.private = NULL; @@ -3440,13
> +3481,17 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
>
> static void ras_event_mgr_init(struct ras_event_manager *mgr) {
> + struct ras_event_state *event_state;
> int i;
>
> memset(mgr, 0, sizeof(*mgr));
> atomic64_set(&mgr->seqno, 0);
>
> - for (i = 0; i < ARRAY_SIZE(mgr->last_seqno); i++)
> - mgr->last_seqno[i] = RAS_EVENT_INVALID_ID;
> + for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
> + event_state = &mgr->event_state[i];
> + event_state->last_seqno = RAS_EVENT_INVALID_ID;
> + atomic64_set(&event_state->count, 0);
> + }
> }
>
> static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) @@ -
> 3960,6 +4005,7 @@ static struct ras_event_manager*
> __get_ras_event_mgr(struct amdgpu_device *adev) int
> amdgpu_ras_mark_ras_event(struct amdgpu_device *adev, enum
> ras_event_type type) {
> struct ras_event_manager *event_mgr;
> + struct ras_event_state *event_state;
>
> if (type >= RAS_EVENT_TYPE_COUNT)
> return -EINVAL;
> @@ -3968,7 +4014,9 @@ int amdgpu_ras_mark_ras_event(struct
> amdgpu_device *adev, enum ras_event_type ty
> if (!event_mgr)
> return -EINVAL;
>
> - event_mgr->last_seqno[type] = atomic64_inc_return(&event_mgr-
> >seqno);
> + event_state = &event_mgr->event_state[type];
> + event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno);
> + atomic64_inc(&event_state->count);
>
> return 0;
> }
> @@ -3989,7 +4037,7 @@ u64 amdgpu_ras_acquire_event_id(struct
> amdgpu_device *adev, enum ras_event_type
> if (!event_mgr)
> return RAS_EVENT_INVALID_ID;
>
> - id = event_mgr->last_seqno[type];
> + id = event_mgr->event_state[type].last_seqno;
> break;
> case RAS_EVENT_TYPE_INVALID:
> default:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 6086da67fa4e..189e2bf53a44 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -436,10 +436,14 @@ enum ras_event_type {
> RAS_EVENT_TYPE_POISON_CONSUMPTION,
> RAS_EVENT_TYPE_COUNT,
> };
> +struct ras_event_state {
> + u64 last_seqno;
> + atomic64_t count;
> +};
>
> struct ras_event_manager {
> atomic64_t seqno;
> - u64 last_seqno[RAS_EVENT_TYPE_COUNT];
> + struct ras_event_state event_state[RAS_EVENT_TYPE_COUNT];
> };
>
> struct ras_event_id {
> @@ -493,6 +497,7 @@ struct amdgpu_ras {
> struct device_attribute features_attr;
> struct device_attribute version_attr;
> struct device_attribute schema_attr;
> + struct device_attribute event_state_attr;
> struct bin_attribute badpages_attr;
> struct dentry *de_ras_eeprom_table;
> /* block array */
> --
> 2.34.1
More information about the amd-gfx mailing list