[PATCH] drm/amdgpu: add amdgpu_timeout_ring_* file to debugfs

Nicolai Hähnle nicolai.haehnle at amd.com
Wed Jun 14 11:27:58 UTC 2023


Report the per-ring timeout in milliseconds and allow users to adjust
the timeout dynamically. A value of 0 stands for "no timeout": writing 0
sets the timeout to MAX_SCHEDULE_TIMEOUT, and MAX_SCHEDULE_TIMEOUT is
read back as 0. This can be useful for debugging, e.g. to more easily
test whether a submission genuinely hangs or is just taking very long,
and to temporarily disable GPU recovery so that shader problems can be
examined in detail, including single-stepping through shader code.

It feels a bit questionable to access ring->sched.timeout without any
locking -- under the C11/C++ memory model, an unsynchronized concurrent
read and write like this is a data race and therefore technically
undefined behavior. But it's not like a lot can go wrong here in
practice, and it's not clear to me what locking or atomics, if any,
should be used.

Signed-off-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 32 +++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index dc474b809604..32d223daa789 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -471,35 +471,65 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf,
 
 	return result;
 }
 
 static const struct file_operations amdgpu_debugfs_ring_fops = {
 	.owner = THIS_MODULE,
 	.read = amdgpu_debugfs_ring_read,
 	.llseek = default_llseek
 };
 
+static int amdgpu_debugfs_timeout_ring_get(void *data, u64 *val) {
+	struct amdgpu_ring *ring = data;
+
+	if (ring->sched.timeout == MAX_SCHEDULE_TIMEOUT)
+		*val = 0;
+	else
+		*val = jiffies_to_msecs(ring->sched.timeout);
+
+	return 0;
+}
+
+static int amdgpu_debugfs_timeout_ring_set(void *data, u64 val) {
+	struct amdgpu_ring *ring = data;
+
+	if (val == 0)
+		ring->sched.timeout = MAX_SCHEDULE_TIMEOUT;
+	else
+		ring->sched.timeout = msecs_to_jiffies(val);
+
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_timeout_ring_fops,
+			 amdgpu_debugfs_timeout_ring_get,
+			 amdgpu_debugfs_timeout_ring_set,
+			 "%llu\n");
+
 #endif
 
 void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
 			      struct amdgpu_ring *ring)
 {
 #if defined(CONFIG_DEBUG_FS)
 	struct drm_minor *minor = adev_to_drm(adev)->primary;
 	struct dentry *root = minor->debugfs_root;
-	char name[32];
+	char name[40];
 
 	sprintf(name, "amdgpu_ring_%s", ring->name);
 	debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, ring,
 				 &amdgpu_debugfs_ring_fops,
 				 ring->ring_size + 12);
 
+	sprintf(name, "amdgpu_timeout_ring_%s", ring->name);
+	debugfs_create_file(name, S_IFREG | S_IRUGO | S_IWUSR, root, ring,
+			    &amdgpu_debugfs_timeout_ring_fops);
 #endif
 }
 
 /**
  * amdgpu_ring_test_helper - tests ring and set sched readiness status
  *
  * @ring: ring to try the recovery on
  *
  * Tests ring and set sched readiness status
  *
-- 
2.40.0



More information about the amd-gfx mailing list