[PATCH v2] drm/amdgpu: add badpages sysfs interface
Alex Deucher
alexdeucher at gmail.com
Wed May 8 19:10:31 UTC 2019
On Tue, May 7, 2019 at 11:15 PM Pan, Xinhui <Xinhui.Pan at amd.com> wrote:
>
> add badpages node.
> it will output badpages list in format
> page : size : flags
gpu pfn : gpu page size : flags
>
> page is PFN.
> flags can be R, P, F.
>
> example
> 0x00000000 : 0x00001000 : R
> 0x00000001 : 0x00001000 : R
> 0x00000002 : 0x00001000 : R
> 0x00000003 : 0x00001000 : R
> 0x00000004 : 0x00001000 : R
> 0x00000005 : 0x00001000 : R
> 0x00000006 : 0x00001000 : R
> 0x00000007 : 0x00001000 : P
> 0x00000008 : 0x00001000 : P
> 0x00000009 : 0x00001000 : P
>
> R: reserved.
> P: pending
> F: failed to reserve for some reason.
>
> Signed-off-by: xinhui pan <xinhui.pan at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 133 ++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
> 2 files changed, 134 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 22bd21efe6b1..2e9fb785019d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -90,6 +90,12 @@ struct ras_manager {
> struct ras_err_data err_data;
> };
>
> +struct ras_badpage {
> + unsigned int bp;
> + unsigned int size;
> + unsigned int flags;
> +};
> +
> const char *ras_error_string[] = {
> "none",
> "parity",
> @@ -691,6 +697,62 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>
> /* sysfs begin */
>
> +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
> + struct ras_badpage **bps, unsigned int *count);
> +
> +static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
> +{
> + switch (flags) {
> + case 0:
> + return "R";
> + case 1:
> + return "P";
> + case 2:
> + default:
> + return "F";
> + };
> +}
> +
> +/*
> + * format: start - end : R|P|F
> + * start, end: page frame number, end is not included.
> + * R: reserved
> + * P: pedning for reserve
pending
> + * F: unable to reserve.
> + */
> +
> +static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
> + struct kobject *kobj, struct bin_attribute *attr,
> + char *buf, loff_t ppos, size_t count)
> +{
> + struct amdgpu_ras *con =
> + container_of(attr, struct amdgpu_ras, badpages_attr);
> + struct amdgpu_device *adev = con->adev;
> + const unsigned int element_size =
> + sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
> + unsigned int start = (ppos + element_size - 1) / element_size;
> + unsigned int end = (ppos + count - 1) / element_size;
> + ssize_t s = 0;
> + struct ras_badpage *bps = NULL;
> + unsigned int bps_count = 0;
> +
> + memset(buf, 0, count);
> +
> + if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
> + return 0;
> +
> + for (; start < end && start < bps_count; start++)
> + s += scnprintf(&buf[s], element_size + 1,
> + "0x%08x : 0x%08x : %1s\n",
> + bps[start].bp,
> + bps[start].size,
> + amdgpu_ras_badpage_flags_str(bps[start].flags));
> +
> + kfree(bps);
> +
> + return s;
> +}
> +
> static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
> struct device_attribute *attr, char *buf)
> {
> @@ -731,9 +793,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
> &con->features_attr.attr,
> NULL
> };
> + struct bin_attribute *bin_attrs[] = {
> + &con->badpages_attr,
> + NULL
> + };
> struct attribute_group group = {
> .name = "ras",
> .attrs = attrs,
> + .bin_attrs = bin_attrs,
> };
>
> con->features_attr = (struct device_attribute) {
> @@ -743,7 +810,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
> },
> .show = amdgpu_ras_sysfs_features_read,
> };
> +
> + con->badpages_attr = (struct bin_attribute) {
> + .attr = {
> + .name = "umc_badpages",
How about "gpu_vram_bad_pages"?
> + .mode = S_IRUGO,
> + },
> + .size = 0,
> + .private = NULL,
> + .read = amdgpu_ras_sysfs_badpages_read,
> + };
> +
> sysfs_attr_init(attrs[0]);
> + sysfs_bin_attr_init(bin_attrs[0]);
>
> return sysfs_create_group(&adev->dev->kobj, &group);
> }
> @@ -755,9 +834,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
> &con->features_attr.attr,
> NULL
> };
> + struct bin_attribute *bin_attrs[] = {
> + &con->badpages_attr,
> + NULL
> + };
> struct attribute_group group = {
> .name = "ras",
> .attrs = attrs,
> + .bin_attrs = bin_attrs,
> };
>
> sysfs_remove_group(&adev->dev->kobj, &group);
> @@ -1089,6 +1173,55 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
> /* ih end */
>
> /* recovery begin */
> +
> +/* return 0 on success.
> + * caller need free bps.
> + */
> +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
> + struct ras_badpage **bps, unsigned int *count)
> +{
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct ras_err_handler_data *data;
> + int i = 0;
> + int ret = 0;
> +
> + if (!con || !con->eh_data || !bps || !count)
> + return -EINVAL;
> +
> + mutex_lock(&con->recovery_lock);
> + data = con->eh_data;
> + if (!data || data->count == 0) {
> + *bps = NULL;
> + goto out;
> + }
> +
> + *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
> + if (!*bps) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + /* TODO
> + * We can combine N nearby pages into one entry with size * N.
> + */
> + for (; i < data->count; i++) {
> + (*bps)[i] = (struct ras_badpage){
> + .bp = data->bps[i].bp,
> + .size = AMDGPU_GPU_PAGE_SIZE,
> + .flags = 0,
> + };
> +
> + if (data->last_reserved <= i)
> + (*bps)[i].flags = 1;
> + else if (data->bps[i].bo == NULL)
> + (*bps)[i].flags = 2;
> + }
> +
> + *count = data->count;
> +out:
> + mutex_unlock(&con->recovery_lock);
> + return ret;
> +}
> +
Please also add a DOC section describing the sysfs interfaces, or
update the existing DOC section so that it covers the sysfs files.
Alex
> static void amdgpu_ras_do_recovery(struct work_struct *work)
> {
> struct amdgpu_ras *ras =
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index eaef5edefc34..600f735d9201 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -93,6 +93,7 @@ struct amdgpu_ras {
> struct dentry *ent;
> /* sysfs */
> struct device_attribute features_attr;
> + struct bin_attribute badpages_attr;
> /* block array */
> struct ras_manager *objs;
>
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list