[PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v3
Zhao, Jiange
Jiange.Zhao at amd.com
Sun Apr 26 10:16:07 UTC 2020
[AMD Official Use Only - Internal Distribution Only]
Hi @Christian König<mailto:ckoenig.leichtzumerken at gmail.com>,
I pulled your patch and tried it. And I have some doubts:
(1) I can't fully understand the functionality of read() in here.
(2) amdgpu_autodump can't be reopened. Because wait_for_completion_interruptible_timeout() would decrement autodump.dumping.done.
(3) When there is no usermode app listening on this node, amdgpu_debugfs_wait_dump() would stuck and reset can't be performed.
So I made some changes on top of your modification as version 4. Please have a look.
Jiange
________________________________
From: Christian König <ckoenig.leichtzumerken at gmail.com>
Sent: Friday, April 24, 2020 7:47 PM
To: amd-gfx at lists.freedesktop.org <amd-gfx at lists.freedesktop.org>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Liu, Monk <Monk.Liu at amd.com>; Zhao, Jiange <Jiange.Zhao at amd.com>
Cc: Kuehling, Felix <Felix.Kuehling at amd.com>
Subject: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v3
From: Jiange Zhao <Jiange.Zhao at amd.com>
When GPU got timeout, it would notify an interested part
of an opportunity to dump info before actual GPU reset.
A usermode app would open 'autodump' node under debugfs system
and poll() for readable/writable. When a GPU reset is due,
amdgpu would notify usermode app through wait_queue_head and give
it 10 minutes to dump info.
After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through
the completion that is triggered in release().
There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.
v2: (1) changed 'registered' to 'app_listening'
(2) add a mutex in open() to prevent race condition
v3 (chk): grab the reset lock to avoid race in autodump_open,
rename debugfs file to amdgpu_autodump,
provide autodump_read as well,
style and code cleanups
Signed-off-by: Jiange Zhao <Jiange.Zhao at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 92 ++++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 6 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +
4 files changed, 101 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..6f8ef98c4b97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -990,6 +990,8 @@ struct amdgpu_device {
char product_number[16];
char product_name[32];
char serial[16];
+
+ struct amdgpu_autodump autodump;
};
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..b1029d12a971 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
#include <linux/pci.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
-
+#include <linux/poll.h>
#include <drm/drm_debugfs.h>
#include "amdgpu.h"
@@ -74,8 +74,96 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
}
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+ const unsigned long timeout = 600 * HZ;
+ int ret;
+
+ wake_up_interruptible(&adev->autodump.gpu_hang);
+
+ ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping,
+ timeout);
+ if (ret == 0) {
+ pr_err("autodump: timeout, move on to gpu recovery\n");
+ return -ETIMEDOUT;
+ }
+#endif
+ return 0;
+}
+
#if defined(CONFIG_DEBUG_FS)
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+ struct amdgpu_device *adev = inode->i_private;
+ int ret;
+
+ file->private_data = adev;
+
+ mutex_lock(&adev->lock_reset);
+ if (adev->autodump.dumping.done) {
+ reinit_completion(&adev->autodump.dumping);
+ ret = 0;
+ } else {
+ ret = -EBUSY;
+ }
+ mutex_unlock(&adev->lock_reset);
+
+ return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode,
+ struct file *file)
+{
+ struct amdgpu_device *adev = file->private_data;
+
+ complete(&adev->autodump.dumping);
+ return 0;
+}
+
+static ssize_t amdgpu_debugfs_autodump_read(struct file *file, char __user *buf,
+ size_t size, loff_t *pos)
+{
+ struct amdgpu_device *adev = file->private_data;
+
+ wait_event_interruptible(adev->autodump.gpu_hang, adev->in_gpu_reset);
+ return 0;
+}
+
+unsigned int amdgpu_debugfs_autodump_poll(struct file *file,
+ struct poll_table_struct *poll_table)
+{
+ struct amdgpu_device *adev = file->private_data;
+
+ poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+ if (adev->in_gpu_reset)
+ return POLLIN | POLLRDNORM | POLLWRNORM;
+
+ return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+ .owner = THIS_MODULE,
+ .open = amdgpu_debugfs_autodump_open,
+ .read = amdgpu_debugfs_autodump_read,
+ .poll = amdgpu_debugfs_autodump_poll,
+ .release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+ init_completion(&adev->autodump.dumping);
+ init_waitqueue_head(&adev->autodump.gpu_hang);
+
+ complete(&adev->autodump.dumping);
+
+ debugfs_create_file("amdgpu_autodump", 0600,
+ adev->ddev->primary->debugfs_root,
+ adev, &autodump_debug_fops);
+}
+
/**
* amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
*
@@ -1434,6 +1522,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
amdgpu_ras_debugfs_create_all(adev);
+ amdgpu_debugfs_autodump_init(adev);
+
return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
ARRAY_SIZE(amdgpu_debugfs_list));
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index de12d1101526..3504c48e1564 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -31,6 +31,11 @@ struct amdgpu_debugfs {
unsigned num_files;
};
+struct amdgpu_autodump {
+ struct completion dumping;
+ struct wait_queue_head gpu_hang;
+};
+
int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
int amdgpu_debugfs_init(struct amdgpu_device *adev);
void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3d601d5dd5af..44e54ea7af0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3915,6 +3915,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
int i, r = 0;
bool need_full_reset = *need_full_reset_arg;
+ amdgpu_debugfs_wait_dump(adev);
+
/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
--
2.17.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20200426/a8614f47/attachment-0001.htm>
More information about the amd-gfx
mailing list