[PATCH v3] drm/amdgpu: add badpages sysfs interafce

William Lewis minutemaidpark at hotmail.com
Thu May 9 15:45:50 UTC 2019


Typo in the patch description.  s/interafce/interface/

On 5/9/19 10:24 AM, Alex Deucher wrote:
> On Thu, May 9, 2019 at 6:31 AM Pan, Xinhui <Xinhui.Pan at amd.com> wrote:
>> Add a badpages node.
>> It outputs the list of bad pages in the format
>> gpu pfn : gpu page size : flags
>>
>> example
>> 0x00000000 : 0x00001000 : R
>> 0x00000001 : 0x00001000 : R
>> 0x00000002 : 0x00001000 : R
>> 0x00000003 : 0x00001000 : R
>> 0x00000004 : 0x00001000 : R
>> 0x00000005 : 0x00001000 : R
>> 0x00000006 : 0x00001000 : R
>> 0x00000007 : 0x00001000 : P
>> 0x00000008 : 0x00001000 : P
>> 0x00000009 : 0x00001000 : P
>>
>> flags can be one of the characters below:
>> R: reserved.
>> P: pending reservation.
>> F: failed to reserve for some reason.
>>
>> Signed-off-by: xinhui pan <xinhui.pan at amd.com>
> Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 146 ++++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |   1 +
>>   2 files changed, 147 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index c60d5f813801..c9e24f60938e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -90,6 +90,12 @@ struct ras_manager {
>>          struct ras_err_data err_data;
>>   };
>>
>> +struct ras_badpage {
>> +       unsigned int bp;
>> +       unsigned int size;
>> +       unsigned int flags;
>> +};
>> +
>>   const char *ras_error_string[] = {
>>          "none",
>>          "parity",
>> @@ -710,6 +716,77 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>>
>>   /* sysfs begin */
>>
>> +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
>> +               struct ras_badpage **bps, unsigned int *count);
>> +
>> +static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
>> +{
>> +       switch (flags) {
>> +       case 0:
>> +               return "R";
>> +       case 1:
>> +               return "P";
>> +       case 2:
>> +       default:
>> +               return "F";
>> +       };
>> +}
>> +
>> +/*
>> + * DOC: ras sysfs gpu_vram_bad_pages interface
>> + *
>> + * It allows user to read the bad pages of vram on the gpu through
>> + * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
>> + *
>> + * It outputs multiple lines, and each line stands for one gpu page.
>> + *
>> + * The format of one line is below,
>> + * gpu pfn : gpu page size : flags
>> + *
>> + * gpu pfn and gpu page size are printed in hex format.
>> + * flags can be one of the characters below:
>> + * R: reserved, this gpu page is reserved and cannot be used.
>> + * P: pending for reserve, this gpu page is marked as bad and will be
>> + *    reserved in the next window of page_reserve.
>> + * F: unable to reserve, this gpu page can't be reserved for some reason.
>> + *
>> + * examples:
>> + * 0x00000001 : 0x00001000 : R
>> + * 0x00000002 : 0x00001000 : P
>> + */
>> +
>> +static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
>> +               struct kobject *kobj, struct bin_attribute *attr,
>> +               char *buf, loff_t ppos, size_t count)
>> +{
>> +       struct amdgpu_ras *con =
>> +               container_of(attr, struct amdgpu_ras, badpages_attr);
>> +       struct amdgpu_device *adev = con->adev;
>> +       const unsigned int element_size =
>> +               sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
>> +       unsigned int start = (ppos + element_size - 1) / element_size;
>> +       unsigned int end = (ppos + count - 1) / element_size;
>> +       ssize_t s = 0;
>> +       struct ras_badpage *bps = NULL;
>> +       unsigned int bps_count = 0;
>> +
>> +       memset(buf, 0, count);
>> +
>> +       if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
>> +               return 0;
>> +
>> +       for (; start < end && start < bps_count; start++)
>> +               s += scnprintf(&buf[s], element_size + 1,
>> +                               "0x%08x : 0x%08x : %1s\n",
>> +                               bps[start].bp,
>> +                               bps[start].size,
>> +                               amdgpu_ras_badpage_flags_str(bps[start].flags));
>> +
>> +       kfree(bps);
>> +
>> +       return s;
>> +}
>> +
>>   static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
>>                  struct device_attribute *attr, char *buf)
>>   {
>> @@ -750,9 +827,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
>>                  &con->features_attr.attr,
>>                  NULL
>>          };
>> +       struct bin_attribute *bin_attrs[] = {
>> +               &con->badpages_attr,
>> +               NULL
>> +       };
>>          struct attribute_group group = {
>>                  .name = "ras",
>>                  .attrs = attrs,
>> +               .bin_attrs = bin_attrs,
>>          };
>>
>>          con->features_attr = (struct device_attribute) {
>> @@ -762,7 +844,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
>>                  },
>>                          .show = amdgpu_ras_sysfs_features_read,
>>          };
>> +
>> +       con->badpages_attr = (struct bin_attribute) {
>> +               .attr = {
>> +                       .name = "gpu_vram_bad_pages",
>> +                       .mode = S_IRUGO,
>> +               },
>> +               .size = 0,
>> +               .private = NULL,
>> +               .read = amdgpu_ras_sysfs_badpages_read,
>> +       };
>> +
>>          sysfs_attr_init(attrs[0]);
>> +       sysfs_bin_attr_init(bin_attrs[0]);
>>
>>          return sysfs_create_group(&adev->dev->kobj, &group);
>>   }
>> @@ -774,9 +868,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
>>                  &con->features_attr.attr,
>>                  NULL
>>          };
>> +       struct bin_attribute *bin_attrs[] = {
>> +               &con->badpages_attr,
>> +               NULL
>> +       };
>>          struct attribute_group group = {
>>                  .name = "ras",
>>                  .attrs = attrs,
>> +               .bin_attrs = bin_attrs,
>>          };
>>
>>          sysfs_remove_group(&adev->dev->kobj, &group);
>> @@ -1108,6 +1207,53 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
>>   /* ih end */
>>
>>   /* recovery begin */
>> +
>> +/* Return 0 on success.
>> + * The caller needs to free bps.
>> + */
>> +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
>> +               struct ras_badpage **bps, unsigned int *count)
>> +{
>> +       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> +       struct ras_err_handler_data *data;
>> +       int i = 0;
>> +       int ret = 0;
>> +
>> +       if (!con || !con->eh_data || !bps || !count)
>> +               return -EINVAL;
>> +
>> +       mutex_lock(&con->recovery_lock);
>> +       data = con->eh_data;
>> +       if (!data || data->count == 0) {
>> +               *bps = NULL;
>> +               goto out;
>> +       }
>> +
>> +       *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
>> +       if (!*bps) {
>> +               ret = -ENOMEM;
>> +               goto out;
>> +       }
>> +
>> +       for (; i < data->count; i++) {
>> +               (*bps)[i] = (struct ras_badpage){
>> +                       .bp = data->bps[i].bp,
>> +                       .size = AMDGPU_GPU_PAGE_SIZE,
>> +                       .flags = 0,
>> +               };
>> +
>> +               if (data->last_reserved <= i)
>> +                       (*bps)[i].flags = 1;
>> +               else if (data->bps[i].bo == NULL)
>> +                       (*bps)[i].flags = 2;
>> +       }
>> +
>> +       *count = data->count;
>> +out:
>> +       mutex_unlock(&con->recovery_lock);
>> +       return ret;
>> +}
>> +
>>   static void amdgpu_ras_do_recovery(struct work_struct *work)
>>   {
>>          struct amdgpu_ras *ras =
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 065c66baf947..e2dff00b8d1c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -93,6 +93,7 @@ struct amdgpu_ras {
>>          struct dentry *ent;
>>          /* sysfs */
>>          struct device_attribute features_attr;
>> +       struct bin_attribute badpages_attr;
>>          /* block array */
>>          struct ras_manager *objs;
>>
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://nam04.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7C%7Cae69f0d4b0df49f576e008d6d4926c9c%7C84df9e7fe9f640afb435aaaaaaaaaaaa%7C1%7C0%7C636930122677737006&sdata=2kIXvUHQfN4G%2BJ0b7FqX4A1x7qXyTIlv3WMErXUtpgY%3D&reserved=0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://nam04.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7C%7Cae69f0d4b0df49f576e008d6d4926c9c%7C84df9e7fe9f640afb435aaaaaaaaaaaa%7C1%7C0%7C636930122677747017&sdata=Evfrt6pS%2B3mEGF7jH6PCzkc0Y8UArseY4GJuhr702Rs%3D&reserved=0


More information about the amd-gfx mailing list