[PATCH] drm/amdgpu: refine reboot debugfs operation in ras case

Mon Oct 21 09:33:05 UTC 2019

The RAS reboot debugfs node gives the user an easy, per-card control to
avoid the GPU recovery hang problem by rebooting the system directly
after a RAS uncorrectable error happens. However, it is a common
entry, so it should be moved out of the ras_ctrl node and should not
require the user to input an IP block name. So add a new
auto_reboot node in the ras debugfs dir to achieve this.

v2: in commit message, add justification for why the ras reboot debugfs
node is needed.

Signed-off-by: Guchun Chen <guchun.chen at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57 ++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6220394521e4..6450065b88de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		op = 1;
 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 		op = 2;
-	else if (sscanf(str, "reboot %32s", block_name) == 1)
-		op = 3;
 	else if (str[0] && str[1] && str[2] && str[3])
 		/* ascii string, but commands are not matched. */
 		return -EINVAL;
@@ -218,12 +216,11 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
  * value to the address.
  *
  * Second member: struct ras_debug_if::op.
- * It has four kinds of operations.
+ * It has three kinds of operations.
  *
  * - 0: disable RAS on the block. Take ::head as its data.
  * - 1: enable RAS on the block. Take ::head as its data.
  * - 2: inject errors on the block. Take ::inject as its data.
- * - 3: reboot on unrecoverable error
  *
  * How to use the interface?
  * programs:
@@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
 		break;
-	case 3:
-		amdgpu_ras_get_context(adev)->reboot = true;
-		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
 	return ret == 1 ? size : -EIO;
 }
 
+/**
+ * DOC: AMDGPU RAS debugfs auto reboot interface
+ *
+ * After an uncorrectable error happens, GPU recovery is scheduled. Because of
+ * a known problem where GPU recovery fails to bring the GPU back, this node
+ * lets the user reboot the system automatically when ERREVENT_ATHUB_INTERRUPT
+ * is generated; the normal GPU recovery routine is then not called. Write 1
+ * to enable or 0 to revert; any other input is rejected with -EINVAL.
+ *
+ * Enable auto_reboot:
+ *
+ *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ * Revert auto_reboot:
+ *
+ *	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ */
+static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
+	const char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev =
+		(struct amdgpu_device *)file_inode(f)->i_private;
+	char tmp[8] = {0};
+	int value = -1;
+
+	/* Never fill the last byte of tmp so it stays NUL-terminated. */
+	if (size != simple_write_to_buffer(tmp, sizeof(tmp) - 1,
+					   pos, buf, size))
+		return -EINVAL;
+
+	/* Only 0 (revert) and 1 (enable) are valid; reject anything else. */
+	if (kstrtoint(tmp, 10, &value) || (value != 0 && value != 1))
+		return -EINVAL;
+
+	amdgpu_ras_get_context(adev)->reboot = (value == 1);
+
+	return size;
+}
+
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
 	.owner = THIS_MODULE,
 	.read = NULL,
@@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
 	.llseek = default_llseek
 };
 
+static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {	/* fops of the auto_reboot node */
+	.owner = THIS_MODULE,
+	.read = NULL,	/* no read handler is provided for this node */
+	.write = amdgpu_ras_debugfs_reboot_write,
+	.llseek = default_llseek
+};
+
 /**
  * DOC: AMDGPU RAS sysfs Error Count Interface
  *
@@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 				adev, &amdgpu_ras_debugfs_ctrl_ops);
 	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
 				adev, &amdgpu_ras_debugfs_eeprom_ops);
+	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
+				adev, &amdgpu_ras_debugfs_reboot_ops);
 }
 
 void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
-- 
2.17.1