[PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
Chen, Guchun
Guchun.Chen at amd.com
Mon Oct 21 09:08:06 UTC 2019
The reboot operation for RAS recovery is a common debugfs entry: it
should not live in the ras_ctrl node, and the user should not have to
name an IP block in order to use it. So add a new auto_reboot node in
the RAS debugfs directory to achieve this.
Signed-off-by: Guchun Chen <guchun.chen at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 ++++++++++++++++++++++---
1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6220394521e4..3adcd29feb5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2;
- else if (sscanf(str, "reboot %32s", block_name) == 1)
- op = 3;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
return -EINVAL;
@@ -223,7 +221,6 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
* - 0: disable RAS on the block. Take ::head as its data.
* - 1: enable RAS on the block. Take ::head as its data.
* - 2: inject errors on the block. Take ::inject as its data.
- * - 3: reboot on unrecoverable error
*
* How to use the interface?
* programs:
@@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
/* data.inject.address is offset instead of absolute gpu address */
ret = amdgpu_ras_error_inject(adev, &data.inject);
break;
- case 3:
- amdgpu_ras_get_context(adev)->reboot = true;
- break;
default:
ret = -EINVAL;
break;
@@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
return ret == 1 ? size : -EIO;
}
+/**
+ * DOC: AMDGPU RAS debugfs auto reboot interface
+ *
+ * After one uncorrectable error happens, GPU recovery will be scheduled.
+ * Because GPU recovery is known to sometimes fail to bring the GPU back,
+ * this interface gives the user a direct way to reboot the system
+ * automatically when ERREVENT_ATHUB_INTERRUPT is generated. In that case
+ * the normal GPU recovery routine will never be called.
+ *
+ * Enable auto_reboot:
+ *
+ * echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ * Revert auto_reboot:
+ *
+ * echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ */
+static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
+		const char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev =
+		(struct amdgpu_device *)file_inode(f)->i_private;
+	char tmp[8] = {0};	/* zero-filled so the copied text stays terminated */
+	int value = -1;
+
+	/* copy at most sizeof(tmp) - 1 bytes so the last byte remains NUL */
+	if (size != simple_write_to_buffer(tmp, sizeof(tmp) - 1, pos, buf, size))
+		return -EINVAL;
+
+	/* reject anything that is not the decimal value 0 or 1 */
+	if (kstrtoint(tmp, 10, &value) || (value != 0 && value != 1))
+		return -EINVAL;
+
+	/* 1 enables auto reboot, 0 reverts it (see the DOC comment above) */
+	amdgpu_ras_get_context(adev)->reboot = (value == 1);
+
+	return size;
+}
+
static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
.owner = THIS_MODULE,
.read = NULL,
@@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
.llseek = default_llseek
};
+static const struct file_operations amdgpu_ras_debugfs_reboot_ops = { /* fops for the "auto_reboot" debugfs node */
+ .owner = THIS_MODULE,
+ .read = NULL, /* no read handler is provided for this node */
+ .write = amdgpu_ras_debugfs_reboot_write, /* parses the 0/1 value written by the user */
+ .llseek = default_llseek
+};
+
/**
* DOC: AMDGPU RAS sysfs Error Count Interface
*
@@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
adev, &amdgpu_ras_debugfs_ctrl_ops);
debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
adev, &amdgpu_ras_debugfs_eeprom_ops);
+ debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
+ adev, &amdgpu_ras_debugfs_reboot_ops);
}
void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
--
2.17.1
More information about the amd-gfx
mailing list