[PATCH v3] drm/amdgpu: add badpages sysfs interafce
William Lewis
minutemaidpark at hotmail.com
Thu May 9 15:45:50 UTC 2019
Typo in the patch subject line: s/interafce/interface/
On 5/9/19 10:24 AM, Alex Deucher wrote:
> On Thu, May 9, 2019 at 6:31 AM Pan, Xinhui <Xinhui.Pan at amd.com> wrote:
>> Add a badpages node.
>> It outputs the bad pages list in the format
>> gpu pfn : gpu page size : flags
>>
>> example
>> 0x00000000 : 0x00001000 : R
>> 0x00000001 : 0x00001000 : R
>> 0x00000002 : 0x00001000 : R
>> 0x00000003 : 0x00001000 : R
>> 0x00000004 : 0x00001000 : R
>> 0x00000005 : 0x00001000 : R
>> 0x00000006 : 0x00001000 : R
>> 0x00000007 : 0x00001000 : P
>> 0x00000008 : 0x00001000 : P
>> 0x00000009 : 0x00001000 : P
>>
>> flags can be one of the following characters:
>> R: reserved.
>> P: pending for reserve.
>> F: failed to reserve for some reason.
>>
>> Signed-off-by: xinhui pan <xinhui.pan at amd.com>
> Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 146 ++++++++++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
>> 2 files changed, 147 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index c60d5f813801..c9e24f60938e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -90,6 +90,12 @@ struct ras_manager {
>> struct ras_err_data err_data;
>> };
>>
>> +struct ras_badpage {
>> + unsigned int bp;
>> + unsigned int size;
>> + unsigned int flags;
>> +};
>> +
>> const char *ras_error_string[] = {
>> "none",
>> "parity",
>> @@ -710,6 +716,77 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>>
>> /* sysfs begin */
>>
>> +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
>> + struct ras_badpage **bps, unsigned int *count);
>> +
>> +static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
>> +{
>> + switch (flags) {
>> + case 0:
>> + return "R";
>> + case 1:
>> + return "P";
>> + case 2:
>> + default:
>> + return "F";
>> + };
>> +}
>> +
>> +/*
>> + * DOC: ras sysfs gpu_vram_bad_pages interface
>> + *
>> + * It allows user to read the bad pages of vram on the gpu through
>> + * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
>> + *
>> + * It outputs multiple lines, and each line stands for one gpu page.
>> + *
>> + * The format of one line is below,
>> + * gpu pfn : gpu page size : flags
>> + *
>> + * gpu pfn and gpu page size are printed in hex format.
>> + * flags can be one of the following characters,
>> + * R: reserved, this gpu page is reserved and cannot be used.
>> + * P: pending for reserve, this gpu page is marked as bad and will be reserved
>> + * in the next window of page_reserve.
>> + * F: unable to reserve, this gpu page can't be reserved for some reason.
>> + *
>> + * examples:
>> + * 0x00000001 : 0x00001000 : R
>> + * 0x00000002 : 0x00001000 : P
>> + */
>> +
>> +static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
>> + struct kobject *kobj, struct bin_attribute *attr,
>> + char *buf, loff_t ppos, size_t count)
>> +{
>> + struct amdgpu_ras *con =
>> + container_of(attr, struct amdgpu_ras, badpages_attr);
>> + struct amdgpu_device *adev = con->adev;
>> + const unsigned int element_size =
>> + sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
>> + unsigned int start = (ppos + element_size - 1) / element_size;
>> + unsigned int end = (ppos + count - 1) / element_size;
>> + ssize_t s = 0;
>> + struct ras_badpage *bps = NULL;
>> + unsigned int bps_count = 0;
>> +
>> + memset(buf, 0, count);
>> +
>> + if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
>> + return 0;
>> +
>> + for (; start < end && start < bps_count; start++)
>> + s += scnprintf(&buf[s], element_size + 1,
>> + "0x%08x : 0x%08x : %1s\n",
>> + bps[start].bp,
>> + bps[start].size,
>> + amdgpu_ras_badpage_flags_str(bps[start].flags));
>> +
>> + kfree(bps);
>> +
>> + return s;
>> +}
>> +
>> static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
>> struct device_attribute *attr, char *buf)
>> {
>> @@ -750,9 +827,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
>> &con->features_attr.attr,
>> NULL
>> };
>> + struct bin_attribute *bin_attrs[] = {
>> + &con->badpages_attr,
>> + NULL
>> + };
>> struct attribute_group group = {
>> .name = "ras",
>> .attrs = attrs,
>> + .bin_attrs = bin_attrs,
>> };
>>
>> con->features_attr = (struct device_attribute) {
>> @@ -762,7 +844,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
>> },
>> .show = amdgpu_ras_sysfs_features_read,
>> };
>> +
>> + con->badpages_attr = (struct bin_attribute) {
>> + .attr = {
>> + .name = "gpu_vram_bad_pages",
>> + .mode = S_IRUGO,
>> + },
>> + .size = 0,
>> + .private = NULL,
>> + .read = amdgpu_ras_sysfs_badpages_read,
>> + };
>> +
>> sysfs_attr_init(attrs[0]);
>> + sysfs_bin_attr_init(bin_attrs[0]);
>>
>> return sysfs_create_group(&adev->dev->kobj, &group);
>> }
>> @@ -774,9 +868,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
>> &con->features_attr.attr,
>> NULL
>> };
>> + struct bin_attribute *bin_attrs[] = {
>> + &con->badpages_attr,
>> + NULL
>> + };
>> struct attribute_group group = {
>> .name = "ras",
>> .attrs = attrs,
>> + .bin_attrs = bin_attrs,
>> };
>>
>> sysfs_remove_group(&adev->dev->kobj, &group);
>> @@ -1108,6 +1207,53 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
>> /* ih end */
>>
>> /* recovery begin */
>> +
>> +/* Returns 0 on success.
>> + * The caller needs to free bps.
>> + */
>> +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
>> + struct ras_badpage **bps, unsigned int *count)
>> +{
>> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> + struct ras_err_handler_data *data;
>> + int i = 0;
>> + int ret = 0;
>> +
>> + if (!con || !con->eh_data || !bps || !count)
>> + return -EINVAL;
>> +
>> + mutex_lock(&con->recovery_lock);
>> + data = con->eh_data;
>> + if (!data || data->count == 0) {
>> + *bps = NULL;
>> + goto out;
>> + }
>> +
>> + *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
>> + if (!*bps) {
>> + ret = -ENOMEM;
>> + goto out;
>> + }
>> +
>> + for (; i < data->count; i++) {
>> + (*bps)[i] = (struct ras_badpage){
>> + .bp = data->bps[i].bp,
>> + .size = AMDGPU_GPU_PAGE_SIZE,
>> + .flags = 0,
>> + };
>> +
>> + if (data->last_reserved <= i)
>> + (*bps)[i].flags = 1;
>> + else if (data->bps[i].bo == NULL)
>> + (*bps)[i].flags = 2;
>> + }
>> +
>> + *count = data->count;
>> +out:
>> + mutex_unlock(&con->recovery_lock);
>> + return ret;
>> +}
>> +
>> static void amdgpu_ras_do_recovery(struct work_struct *work)
>> {
>> struct amdgpu_ras *ras =
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 065c66baf947..e2dff00b8d1c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -93,6 +93,7 @@ struct amdgpu_ras {
>> struct dentry *ent;
>> /* sysfs */
>> struct device_attribute features_attr;
>> + struct bin_attribute badpages_attr;
>> /* block array */
>> struct ras_manager *objs;
>>
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://nam04.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7C%7Cae69f0d4b0df49f576e008d6d4926c9c%7C84df9e7fe9f640afb435aaaaaaaaaaaa%7C1%7C0%7C636930122677737006&sdata=2kIXvUHQfN4G%2BJ0b7FqX4A1x7qXyTIlv3WMErXUtpgY%3D&reserved=0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://nam04.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7C%7Cae69f0d4b0df49f576e008d6d4926c9c%7C84df9e7fe9f640afb435aaaaaaaaaaaa%7C1%7C0%7C636930122677747017&sdata=Evfrt6pS%2B3mEGF7jH6PCzkc0Y8UArseY4GJuhr702Rs%3D&reserved=0
More information about the amd-gfx
mailing list