[PATCH 38/40] drm/amdgpu: RAS EEPROM table is now in debugfs
Luben Tuikov
luben.tuikov at amd.com
Fri Jun 11 18:06:29 UTC 2021
On 2021-06-11 1:51 p.m., Alex Deucher wrote:
> On Fri, Jun 11, 2021 at 1:30 PM Luben Tuikov <luben.tuikov at amd.com> wrote:
>> On 2021-06-11 1:16 p.m., Alex Deucher wrote:
>>> On Tue, Jun 8, 2021 at 5:41 PM Luben Tuikov <luben.tuikov at amd.com> wrote:
>>>> Add "ras_eeprom_size" file in debugfs, which
>>>> reports the maximum size allocated to the RAS
>>>> table in EEROM, as the number of bytes and the
>>>> number of records it could store. For instance,
>>>>
>>>> $cat /sys/kernel/debug/dri/0/ras/ras_eeprom_size
>>>> 262144 bytes or 10921 records
>>>> $_
>>>>
>>>> Add "ras_eeprom_table" file in debugfs, which
>>>> dumps the RAS table stored EEPROM, in a formatted
>>>> way. For instance,
>>>>
>>>> $cat ras_eeprom_table
>>>> Signature Version FirstOffs Size Checksum
>>>> 0x414D4452 0x00010000 0x00000014 0x000000EC 0x000000DA
>>>> Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage
>>>> 0 0x00014 ue 0x00 0x00000000607608DC 0x000000000000 0x00 0x00 0x000000000000
>>>> 1 0x0002C ue 0x00 0x00000000607608DC 0x000000001000 0x00 0x00 0x000000000001
>>>> 2 0x00044 ue 0x00 0x00000000607608DC 0x000000002000 0x00 0x00 0x000000000002
>>>> 3 0x0005C ue 0x00 0x00000000607608DC 0x000000003000 0x00 0x00 0x000000000003
>>>> 4 0x00074 ue 0x00 0x00000000607608DC 0x000000004000 0x00 0x00 0x000000000004
>>>> 5 0x0008C ue 0x00 0x00000000607608DC 0x000000005000 0x00 0x00 0x000000000005
>>>> 6 0x000A4 ue 0x00 0x00000000607608DC 0x000000006000 0x00 0x00 0x000000000006
>>>> 7 0x000BC ue 0x00 0x00000000607608DC 0x000000007000 0x00 0x00 0x000000000007
>>>> 8 0x000D4 ue 0x00 0x00000000607608DD 0x000000008000 0x00 0x00 0x000000000008
>>>> $_
>>>>
>>>> Cc: Alexander Deucher <Alexander.Deucher at amd.com>
>>>> Cc: Andrey Grodzovsky <Andrey.Grodzovsky at amd.com>
>>>> Cc: John Clements <john.clements at amd.com>
>>>> Cc: Hawking Zhang <Hawking.Zhang at amd.com>
>>>> Cc: Xinhui Pan <xinhui.pan at amd.com>
>>>> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
>>> Seems like a useful feature. Just a few comments below.
>>>
>>> Alex
>>>
>>>
>>>> ---
>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 +-
>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
>>>> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 241 +++++++++++++++++-
>>>> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 10 +-
>>>> 4 files changed, 252 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>>> index 1424f2cc2076c1..d791a360a92366 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>>> @@ -404,9 +404,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>>>> /* umc ce/ue error injection for a bad page is not allowed */
>>>> if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
>>>> amdgpu_ras_check_bad_page(adev, data.inject.address)) {
>>>> - dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked "
>>>> - "as bad before error injection!\n",
>>>> - data.inject.address);
>>>> + dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
>>>> + "already been marked as bad!\n",
>>>> + data.inject.address);
>>> This seems unrelated to this patch.
>> It's just cosmetic fix, to correctly align, as it seems that the previous alignment was arbitrary.
>> Just pressing TAB in Emacs does wonders. :-)
>>
>> I was in this file and decided to fix this. It's just cosmetic. No functional change.
>>
>>>> break;
>>>> }
>>>>
>>>> @@ -1301,6 +1301,12 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *
>>>> &con->bad_page_cnt_threshold);
>>>> debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
>>>> debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
>>>> + debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
>>>> + &amdgpu_ras_debugfs_eeprom_size_ops);
>>>> + con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
>>>> + S_IRUGO, dir, adev,
>>>> + &amdgpu_ras_debugfs_eeprom_table_ops);
>>>> + amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
>>>>
>>>> /*
>>>> * After one uncorrectable error happens, usually GPU recovery will
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>>> index 256cea5d34f2b6..283afd791db107 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>>> @@ -318,6 +318,7 @@ struct amdgpu_ras {
>>>> /* sysfs */
>>>> struct device_attribute features_attr;
>>>> struct bin_attribute badpages_attr;
>>>> + struct dentry *de_ras_eeprom_table;
>>>> /* block array */
>>>> struct ras_manager *objs;
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
>>>> index dc4a845a32404c..677e379f5fb5e9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
>>>> @@ -27,6 +27,8 @@
>>>> #include <linux/bits.h>
>>>> #include "atom.h"
>>>> #include "amdgpu_eeprom.h"
>>>> +#include <linux/debugfs.h>
>>>> +#include <linux/uaccess.h>
>>>>
>>>> #define EEPROM_I2C_MADDR_VEGA20 0x0
>>>> #define EEPROM_I2C_MADDR_ARCTURUS 0x40000
>>>> @@ -70,6 +72,13 @@
>>>> #define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \
>>>> (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)
>>>>
>>>> +/* Given a 0-based relative record index, 0, 1, 2, ..., etc., off
>>>> + * of "fri", return the absolute record index off of the end of
>>>> + * the table header.
>>>> + */
>>>> +#define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \
>>>> + (_C)->ras_max_record_count)
>>>> +
>>>> #define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
>>>> RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)
>>>>
>>>> @@ -77,13 +86,10 @@
>>>>
>>>> static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
>>>> {
>>>> - if ((adev->asic_type == CHIP_VEGA20) ||
>>>> - (adev->asic_type == CHIP_ARCTURUS) ||
>>>> - (adev->asic_type == CHIP_SIENNA_CICHLID) ||
>>>> - (adev->asic_type == CHIP_ALDEBARAN))
>>>> - return true;
>>>> -
>>>> - return false;
>>>> + return adev->asic_type == CHIP_VEGA20 ||
>>>> + adev->asic_type == CHIP_ARCTURUS ||
>>>> + adev->asic_type == CHIP_SIENNA_CICHLID ||
>>>> + adev->asic_type == CHIP_ALDEBARAN;
>>> Unrelated whitespace change.
>> It's more readable and succinct like this, no?
>>
>> Do you want me to revert these? I mean, they're pleasing to have and change no functionality, and since I was in this file...
>>
> Don't worry about respinning to break these out in this patch, but in
> general it's better to keep formatting cleanups separate from
> functional changes; makes it easier to review the functional changes.
>
> Acked-by: Alex Deucher <alexander.deucher at amd.com>
Okay, great--thanks!
I'll repost the whole set with the squashed patch and the new abort-fixing one for long transfers as soon as the dust settles. (FWIW, it's now available on Gitlab. :-) )
Regards,
Luben
>
> Alex
>
>
>> Regards,
>> Luben
>>
>>>> }
>>>>
>>>> static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev,
>>>> @@ -258,6 +264,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
>>>> control->ras_num_recs = 0;
>>>> control->ras_fri = 0;
>>>>
>>>> + amdgpu_ras_debugfs_set_ret_size(control);
>>>> +
>>>> mutex_unlock(&control->ras_tbl_mutex);
>>>>
>>>> return res;
>>>> @@ -591,6 +599,8 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
>>>> res = amdgpu_ras_eeprom_append_table(control, record, num);
>>>> if (!res)
>>>> res = amdgpu_ras_eeprom_update_header(control);
>>>> + if (!res)
>>>> + amdgpu_ras_debugfs_set_ret_size(control);
>>>>
>>>> mutex_unlock(&control->ras_tbl_mutex);
>>>> return res;
>>>> @@ -734,6 +744,223 @@ inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
>>>> return RAS_MAX_RECORD_COUNT;
>>>> }
>>>>
>>>> +static ssize_t
>>>> +amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf,
>>>> + size_t size, loff_t *pos)
>>>> +{
>>>> + struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
>>>> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>>>> + struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
>>>> + u8 data[50];
>>>> + int res;
>>>> +
>>>> + if (!size)
>>>> + return size;
>>>> +
>>>> + if (!ras || !control) {
>>>> + res = snprintf(data, sizeof(data), "Not supported\n");
>>>> + } else {
>>>> + res = snprintf(data, sizeof(data), "%d bytes or %d records\n",
>>>> + RAS_TBL_SIZE_BYTES, control->ras_max_record_count);
>>>> + }
>>>> +
>>>> + if (*pos >= res)
>>>> + return 0;
>>>> +
>>>> + res -= *pos;
>>>> + res = min_t(size_t, res, size);
>>>> +
>>>> + if (copy_to_user(buf, &data[*pos], res))
>>>> + return -EINVAL;
>>>> +
>>>> + *pos += res;
>>>> +
>>>> + return res;
>>>> +}
>>>> +
>>>> +const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = {
>>>> + .owner = THIS_MODULE,
>>>> + .read = amdgpu_ras_debugfs_eeprom_size_read,
>>>> + .write = NULL,
>>>> + .llseek = default_llseek,
>>>> +};
>>>> +
>>>> +static const char *tbl_hdr_str = " Signature Version FirstOffs Size Checksum\n";
>>>> +static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n";
>>>> +#define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1)
>>>> +static const char *rec_hdr_str = "Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage\n";
>>>> +static const char *rec_hdr_fmt = "%5d 0x%05X %7s 0x%02X 0x%016llX 0x%012llX 0x%02X 0x%02X 0x%012llX\n";
>>>> +#define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1)
>>>> +
>>>> +static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = {
>>>> + "ignore",
>>>> + "re",
>>>> + "ue",
>>>> +};
>>>> +
>>>> +static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control)
>>>> +{
>>>> + return strlen(tbl_hdr_str) + tbl_hdr_fmt_size +
>>>> + strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs;
>>>> +}
>>>> +
>>>> +void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control)
>>>> +{
>>>> + struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras,
>>>> + eeprom_control);
>>>> + struct dentry *de = ras->de_ras_eeprom_table;
>>>> +
>>>> + if (de)
>>>> + d_inode(de)->i_size = amdgpu_ras_debugfs_table_size(control);
>>>> +}
>>>> +
>>>> +static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf,
>>>> + size_t size, loff_t *pos)
>>>> +{
>>>> + struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
>>>> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>>>> + struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
>>>> + const size_t orig_size = size;
>>>> + int res = -EINVAL;
>>>> + size_t data_len;
>>>> +
>>>> + mutex_lock(&control->ras_tbl_mutex);
>>>> +
>>>> + /* We want *pos - data_len > 0, which means there's
>>>> + * bytes to be printed from data.
>>>> + */
>>>> + data_len = strlen(tbl_hdr_str);
>>>> + if (*pos < data_len) {
>>>> + data_len -= *pos;
>>>> + data_len = min_t(size_t, data_len, size);
>>>> + if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len))
>>>> + goto Out;
>>>> + buf += data_len;
>>>> + size -= data_len;
>>>> + *pos += data_len;
>>>> + }
>>>> +
>>>> + data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size;
>>>> + if (*pos < data_len && size > 0) {
>>>> + u8 data[tbl_hdr_fmt_size + 1];
>>>> + loff_t lpos;
>>>> +
>>>> + snprintf(data, sizeof(data), tbl_hdr_fmt,
>>>> + control->tbl_hdr.header,
>>>> + control->tbl_hdr.version,
>>>> + control->tbl_hdr.first_rec_offset,
>>>> + control->tbl_hdr.tbl_size,
>>>> + control->tbl_hdr.checksum);
>>>> +
>>>> + data_len -= *pos;
>>>> + data_len = min_t(size_t, data_len, size);
>>>> + lpos = *pos - strlen(tbl_hdr_str);
>>>> + if (copy_to_user(buf, &data[lpos], data_len))
>>>> + goto Out;
>>>> + buf += data_len;
>>>> + size -= data_len;
>>>> + *pos += data_len;
>>>> + }
>>>> +
>>>> + data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str);
>>>> + if (*pos < data_len && size > 0) {
>>>> + loff_t lpos;
>>>> +
>>>> + data_len -= *pos;
>>>> + data_len = min_t(size_t, data_len, size);
>>>> + lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size;
>>>> + if (copy_to_user(buf, &rec_hdr_str[lpos], data_len))
>>>> + goto Out;
>>>> + buf += data_len;
>>>> + size -= data_len;
>>>> + *pos += data_len;
>>>> + }
>>>> +
>>>> + data_len = amdgpu_ras_debugfs_table_size(control);
>>>> + if (*pos < data_len && size > 0) {
>>>> + u8 dare[RAS_TABLE_RECORD_SIZE];
>>>> + u8 data[rec_hdr_fmt_size + 1];
>>>> + /* Find the starting record index
>>>> + */
>>>> + int s = (*pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
>>>> + strlen(rec_hdr_str)) / rec_hdr_fmt_size;
>>>> + int r = (*pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
>>>> + strlen(rec_hdr_str)) % rec_hdr_fmt_size;
>>>> + struct eeprom_table_record record;
>>>> +
>>>> + for ( ; size > 0 && s < control->ras_num_recs; s++) {
>>>> + u32 ai = RAS_RI_TO_AI(control, s);
>>>> + /* Read a single record
>>>> + */
>>>> + res = __amdgpu_ras_eeprom_read(control, dare, ai, 1);
>>>> + if (res)
>>>> + goto Out;
>>>> + __decode_table_record_from_buf(control, &record, dare);
>>>> + snprintf(data, sizeof(data), rec_hdr_fmt,
>>>> + s,
>>>> + RAS_INDEX_TO_OFFSET(control, ai),
>>>> + record_err_type_str[record.err_type],
>>>> + record.bank,
>>>> + record.ts,
>>>> + record.offset,
>>>> + record.mem_channel,
>>>> + record.mcumc_id,
>>>> + record.retired_page);
>>>> +
>>>> + data_len = min_t(size_t, rec_hdr_fmt_size - r, size);
>>>> + if (copy_to_user(buf, &data[r], data_len))
>>>> + return -EINVAL;
>>>> + buf += data_len;
>>>> + size -= data_len;
>>>> + *pos += data_len;
>>>> + r = 0;
>>>> + }
>>>> + }
>>>> + res = 0;
>>>> +Out:
>>>> + mutex_unlock(&control->ras_tbl_mutex);
>>>> + return res < 0 ? res : orig_size - size;
>>>> +}
>>>> +
>>>> +static ssize_t
>>>> +amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf,
>>>> + size_t size, loff_t *pos)
>>>> +{
>>>> + struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
>>>> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>>>> + struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
>>>> + u8 data[81];
>>>> + int res;
>>>> +
>>>> + if (!size)
>>>> + return size;
>>>> +
>>>> + if (!ras || !control) {
>>>> + res = snprintf(data, sizeof(data), "Not supported\n");
>>>> + if (*pos >= res)
>>>> + return 0;
>>>> +
>>>> + res -= *pos;
>>>> + res = min_t(size_t, res, size);
>>>> +
>>>> + if (copy_to_user(buf, &data[*pos], res))
>>>> + return -EINVAL;
>>>> +
>>>> + *pos += res;
>>>> +
>>>> + return res;
>>>> + } else {
>>>> + return amdgpu_ras_debugfs_table_read(f, buf, size, pos);
>>>> + }
>>>> +}
>>>> +
>>>> +const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = {
>>>> + .owner = THIS_MODULE,
>>>> + .read = amdgpu_ras_debugfs_eeprom_table_read,
>>>> + .write = NULL,
>>>> + .llseek = default_llseek,
>>>> +};
>>>> +
>>>> /**
>>>> * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum
>>>> * @control: pointer to control structure
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
>>>> index edb0195ea2eb8c..430e08ab3313a2 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
>>>> @@ -29,9 +29,10 @@
>>>> struct amdgpu_device;
>>>>
>>>> enum amdgpu_ras_eeprom_err_type {
>>>> - AMDGPU_RAS_EEPROM_ERR_PLACE_HOLDER,
>>>> + AMDGPU_RAS_EEPROM_ERR_NA,
>>>> AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,
>>>> - AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE
>>>> + AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE,
>>>> + AMDGPU_RAS_EEPROM_ERR_COUNT,
>>>> };
>>>>
>>>> struct amdgpu_ras_eeprom_table_header {
>>>> @@ -121,4 +122,9 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
>>>>
>>>> inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
>>>>
>>>> +void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
>>>> +
>>>> +extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
>>>> +extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
>>>> +
>>>> #endif // _AMDGPU_RAS_EEPROM_H
>>>> --
>>>> 2.32.0
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx at lists.freedesktop.org
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7Cluben.tuikov%40amd.com%7C522f2150d3a94b6f713308d92d018213%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637590306808868534%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=1FpDKAaXISnv0NjyVvit4%2FIHxSU7bev%2FpLEYuImOin4%3D&reserved=0
More information about the amd-gfx
mailing list