[v3] drm/amdgpu: add badpages sysfs interface

Nathan Chancellor natechancellor at gmail.com
Fri May 17 07:19:05 UTC 2019


On Thu, May 09, 2019 at 10:31:05AM +0000, Pan, Xinhui wrote:
> add badpages node.
> it will output badpages list in format
> gpu pfn : gpu page size : flags
> 
> example
> 0x00000000 : 0x00001000 : R
> 0x00000001 : 0x00001000 : R
> 0x00000002 : 0x00001000 : R
> 0x00000003 : 0x00001000 : R
> 0x00000004 : 0x00001000 : R
> 0x00000005 : 0x00001000 : R
> 0x00000006 : 0x00001000 : R
> 0x00000007 : 0x00001000 : P
> 0x00000008 : 0x00001000 : P
> 0x00000009 : 0x00001000 : P
> 
> flags can be one of below characters
> R: reserved.
> P: pending for reserve.
> F: failed to reserve for some reasons.
> 
> Signed-off-by: xinhui pan <xinhui.pan at amd.com>
> Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 146 ++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |   1 +
>  2 files changed, 147 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index c60d5f813801..c9e24f60938e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -90,6 +90,12 @@ struct ras_manager {
>  	struct ras_err_data err_data;
>  };
>  
> +struct ras_badpage {
> +	unsigned int bp;
> +	unsigned int size;
> +	unsigned int flags;
> +};
> +
>  const char *ras_error_string[] = {
>  	"none",
>  	"parity",
> @@ -710,6 +716,77 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>  
>  /* sysfs begin */
>  
> +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
> +		struct ras_badpage **bps, unsigned int *count);
> +
> +static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
> +{
> +	switch (flags) {
> +	case 0:
> +		return "R";
> +	case 1:
> +		return "P";
> +	case 2:
> +	default:
> +		return "F";
> +	};
> +}
> +
> +/*
> + * DOC: ras sysfs gpu_vram_bad_pages interface
> + *
> + * It allows user to read the bad pages of vram on the gpu through
> + * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
> + *
> + * It outputs multiple lines, and each line stands for one gpu page.
> + *
> + * The format of one line is below,
> + * gpu pfn : gpu page size : flags
> + *
> + * gpu pfn and gpu page size are printed in hex format.
> + * flags can be one of below character,
> + * R: reserved, this gpu page is reserved and not able to use.
> + * P: pending for reserve, this gpu page is marked as bad, will be reserved
> + *    in next window of page_reserve.
> + * F: unable to reserve. this gpu page can't be reserved due to some reasons.
> + *
> + * examples:
> + * 0x00000001 : 0x00001000 : R
> + * 0x00000002 : 0x00001000 : P
> + */
> +
> +static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
> +		struct kobject *kobj, struct bin_attribute *attr,
> +		char *buf, loff_t ppos, size_t count)
> +{
> +	struct amdgpu_ras *con =
> +		container_of(attr, struct amdgpu_ras, badpages_attr);
> +	struct amdgpu_device *adev = con->adev;
> +	const unsigned int element_size =
> +		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
> +	unsigned int start = (ppos + element_size - 1) / element_size;
> +	unsigned int end = (ppos + count - 1) / element_size;

I believe these two lines cause a link-time error with arm32 defconfig +
CONFIG_DRM_AMDGPU (filtered down from allyesconfig):

arm-linux-gnueabi-ld: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.o: in function `amdgpu_ras_sysfs_badpages_read':
amdgpu_ras.c:(.text+0x804): undefined reference to `__aeabi_ldivmod'
arm-linux-gnueabi-ld: amdgpu_ras.c:(.text+0x830): undefined reference to `__aeabi_ldivmod'

The assignments of start and end involve a 64-bit dividend because ppos is
an loff_t, which is defined as long long, meaning one of the 64-bit division
functions from include/linux/math64.h should be used. I am not sure which
one, otherwise I would have sent a patch :)

Cheers,
Nathan

> +	ssize_t s = 0;
> +	struct ras_badpage *bps = NULL;
> +	unsigned int bps_count = 0;
> +
> +	memset(buf, 0, count);
> +
> +	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
> +		return 0;
> +
> +	for (; start < end && start < bps_count; start++)
> +		s += scnprintf(&buf[s], element_size + 1,
> +				"0x%08x : 0x%08x : %1s\n",
> +				bps[start].bp,
> +				bps[start].size,
> +				amdgpu_ras_badpage_flags_str(bps[start].flags));
> +
> +	kfree(bps);
> +
> +	return s;
> +}
> +
>  static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
>  		struct device_attribute *attr, char *buf)
>  {
> @@ -750,9 +827,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
>  		&con->features_attr.attr,
>  		NULL
>  	};
> +	struct bin_attribute *bin_attrs[] = {
> +		&con->badpages_attr,
> +		NULL
> +	};
>  	struct attribute_group group = {
>  		.name = "ras",
>  		.attrs = attrs,
> +		.bin_attrs = bin_attrs,
>  	};
>  
>  	con->features_attr = (struct device_attribute) {
> @@ -762,7 +844,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
>  		},
>  			.show = amdgpu_ras_sysfs_features_read,
>  	};
> +
> +	con->badpages_attr = (struct bin_attribute) {
> +		.attr = {
> +			.name = "gpu_vram_bad_pages",
> +			.mode = S_IRUGO,
> +		},
> +		.size = 0,
> +		.private = NULL,
> +		.read = amdgpu_ras_sysfs_badpages_read,
> +	};
> +
>  	sysfs_attr_init(attrs[0]);
> +	sysfs_bin_attr_init(bin_attrs[0]);
>  
>  	return sysfs_create_group(&adev->dev->kobj, &group);
>  }
> @@ -774,9 +868,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
>  		&con->features_attr.attr,
>  		NULL
>  	};
> +	struct bin_attribute *bin_attrs[] = {
> +		&con->badpages_attr,
> +		NULL
> +	};
>  	struct attribute_group group = {
>  		.name = "ras",
>  		.attrs = attrs,
> +		.bin_attrs = bin_attrs,
>  	};
>  
>  	sysfs_remove_group(&adev->dev->kobj, &group);
> @@ -1108,6 +1207,53 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
>  /* ih end */
>  
>  /* recovery begin */
> +
> +/* return 0 on success.
> + * caller need free bps.
> + */
> +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
> +		struct ras_badpage **bps, unsigned int *count)
> +{
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	struct ras_err_handler_data *data;
> +	int i = 0;
> +	int ret = 0;
> +
> +	if (!con || !con->eh_data || !bps || !count)
> +		return -EINVAL;
> +
> +	mutex_lock(&con->recovery_lock);
> +	data = con->eh_data;
> +	if (!data || data->count == 0) {
> +		*bps = NULL;
> +		goto out;
> +	}
> +
> +	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
> +	if (!*bps) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	for (; i < data->count; i++) {
> +		(*bps)[i] = (struct ras_badpage){
> +			.bp = data->bps[i].bp,
> +			.size = AMDGPU_GPU_PAGE_SIZE,
> +			.flags = 0,
> +		};
> +
> +		if (data->last_reserved <= i)
> +			(*bps)[i].flags = 1;
> +		else if (data->bps[i].bo == NULL)
> +			(*bps)[i].flags = 2;
> +	}
> +
> +	*count = data->count;
> +out:
> +	mutex_unlock(&con->recovery_lock);
> +	return ret;
> +}
> +
>  static void amdgpu_ras_do_recovery(struct work_struct *work)
>  {
>  	struct amdgpu_ras *ras =
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 065c66baf947..e2dff00b8d1c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -93,6 +93,7 @@ struct amdgpu_ras {
>  	struct dentry *ent;
>  	/* sysfs */
>  	struct device_attribute features_attr;
> +	struct bin_attribute badpages_attr;
>  	/* block array */
>  	struct ras_manager *objs;
>  


More information about the amd-gfx mailing list