[PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
Chen, Guchun
Guchun.Chen at amd.com
Mon Oct 21 09:19:01 UTC 2019
Thanks Christian.
Let me update the commit body in v2.
Regards,
Guchun
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken at gmail.com>
Sent: Monday, October 21, 2019 5:12 PM
To: Chen, Guchun <Guchun.Chen at amd.com>; amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>
Cc: Li, Candice <Candice.Li at amd.com>
Subject: Re: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
Am 21.10.19 um 11:08 schrieb Chen, Guchun:
> Reboot operation for ras recovery is one common debugfs entry, which
> should get rid of ras_ctrl node and remove ip dependence when
> inputting by user. So add one new auto_reboot node in ras debugfs dir
> to achieve this.
We need some justification for why this can't be a module parameter instead.
For example, write something like "we want to control reboot behavior on a per-device basis".
Apart from that, this looks like a nice cleanup to me.
Regards,
Christian.
>
> Signed-off-by: Guchun Chen <guchun.chen at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 ++++++++++++++++++++++---
> 1 file changed, 49 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6220394521e4..3adcd29feb5f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
> op = 1;
> else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
> op = 2;
> - else if (sscanf(str, "reboot %32s", block_name) == 1)
> - op = 3;
> else if (str[0] && str[1] && str[2] && str[3])
> /* ascii string, but commands are not matched. */
> return -EINVAL;
> @@ -223,7 +221,6 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
> * - 0: disable RAS on the block. Take ::head as its data.
> * - 1: enable RAS on the block. Take ::head as its data.
> * - 2: inject errors on the block. Take ::inject as its data.
> - * - 3: reboot on unrecoverable error
> *
> * How to use the interface?
> * programs:
> @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
> /* data.inject.address is offset instead of absolute gpu address */
> ret = amdgpu_ras_error_inject(adev, &data.inject);
> break;
> - case 3:
> - amdgpu_ras_get_context(adev)->reboot = true;
> - break;
> default:
> ret = -EINVAL;
> break;
> @@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
> return ret == 1 ? size : -EIO;
> }
>
> +/**
> + * DOC: AMDGPU RAS debugfs auto reboot interface
> + *
> + * After one uncorrectable error happens, GPU recovery will be scheduled.
> + * Due to the known problem in GPU recovery failing to bring GPU back, this
> + * interface provides one direct way to user to reboot system automatically
> + * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery
> + * routine will never be called.
> + *
> + * Enable auto_reboot:
> + *
> + * echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + * Revert auto_reboot:
> + *
> + * echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + */
> +static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
> + const char __user *buf, size_t size, loff_t *pos) {
> + struct amdgpu_device *adev =
> + (struct amdgpu_device *)file_inode(f)->i_private;
> + char tmp[8] = {0};
> + int value = -1;
> +
> + if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
> + return -EINVAL;
> +
> + if (kstrtoint(tmp, 10, &value))
> + return -EINVAL;
> +
> + if (value == 1)
> + amdgpu_ras_get_context(adev)->reboot = true;
> + else if (value == 0)
> + amdgpu_ras_get_context(adev)->reboot = false;
> +
> + return size;
> +}
> +
> static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
> .owner = THIS_MODULE,
> .read = NULL,
> @@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
> .llseek = default_llseek
> };
>
> +static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
> + .owner = THIS_MODULE,
> + .read = NULL,
> + .write = amdgpu_ras_debugfs_reboot_write,
> + .llseek = default_llseek
> +};
> +
> /**
> * DOC: AMDGPU RAS sysfs Error Count Interface
> *
> @@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
> adev, &amdgpu_ras_debugfs_ctrl_ops);
> debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
> adev, &amdgpu_ras_debugfs_eeprom_ops);
> + debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
> + adev, &amdgpu_ras_debugfs_reboot_ops);
> }
>
> void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
More information about the amd-gfx
mailing list