[Intel-gfx] [PATCH] drm/i915: The return of i915_gpu_info to debugfs

Chris Wilson chris at chris-wilson.co.uk
Tue Jan 31 10:33:03 UTC 2017


Once upon a time before we had automated GPU state capture upon hangs,
we had intel_gpu_dump. Now we come almost full circle and reinstate that
view of the current GPU queues and registers by using the error capture
facility to snapshot the GPU state when debugfs/.../i915_gpu_info is
opened - which should provided useful debugging to both the error
capture routines (without having to cause a hang and avoid the error
state being eaten by igt) and generally.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c   | 123 +++++++++++++++++++---------------
 drivers/gpu/drm/i915/i915_drv.h       |  26 ++++---
 drivers/gpu/drm/i915/i915_gpu_error.c |  71 ++++++++------------
 drivers/gpu/drm/i915/i915_sysfs.c     |  31 +++++----
 4 files changed, 132 insertions(+), 119 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3ae06568df7b..124a184389a8 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -954,89 +954,103 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
-
-static ssize_t
-i915_error_state_write(struct file *filp,
-		       const char __user *ubuf,
-		       size_t cnt,
-		       loff_t *ppos)
+static ssize_t error_state_read(struct file *file, char __user *ubuf,
+				size_t count, loff_t *pos)
 {
-	struct i915_error_state_file_priv *error_priv = filp->private_data;
-
-	DRM_DEBUG_DRIVER("Resetting error state\n");
-	i915_destroy_error_state(error_priv->i915);
-
-	return cnt;
-}
+	struct drm_i915_error_state *error = file->private_data;
+	struct drm_i915_error_state_buf str;
+	ssize_t ret;
+	loff_t tmp;
 
-static int i915_error_state_open(struct inode *inode, struct file *file)
-{
-	struct drm_i915_private *dev_priv = inode->i_private;
-	struct i915_error_state_file_priv *error_priv;
+	if (!error)
+		return 0;
 
-	error_priv = kzalloc(sizeof(*error_priv), GFP_KERNEL);
-	if (!error_priv)
-		return -ENOMEM;
+	ret = i915_error_state_buf_init(&str, error->i915, count, *pos);
+	if (ret)
+		return ret;
 
-	error_priv->i915 = dev_priv;
+	ret = i915_error_state_to_str(&str, error);
+	if (ret)
+		goto out;
 
-	i915_error_state_get(&dev_priv->drm, error_priv);
+	tmp = 0;
+	ret = simple_read_from_buffer(ubuf, count, &tmp, str.buf, str.bytes);
+	if (ret < 0)
+		goto out;
 
-	file->private_data = error_priv;
+	*pos = str.start + ret;
+out:
+	i915_error_state_buf_release(&str);
+	return ret;
+}
 
+static int error_state_release(struct inode *inode, struct file *file)
+{
+	i915_error_state_put(file->private_data);
 	return 0;
 }
 
-static int i915_error_state_release(struct inode *inode, struct file *file)
+static int i915_gpu_info_open(struct inode *inode, struct file *file)
 {
-	struct i915_error_state_file_priv *error_priv = file->private_data;
+	struct drm_i915_error_state *error;
 
-	i915_error_state_put(error_priv);
-	kfree(error_priv);
+	error = i915_error_state(inode->i_private);
+	if (!error)
+		return -ENOMEM;
 
+	file->private_data = error;
 	return 0;
 }
 
-static ssize_t i915_error_state_read(struct file *file, char __user *userbuf,
-				     size_t count, loff_t *pos)
+static const struct file_operations i915_gpu_info_fops = {
+	.owner = THIS_MODULE,
+	.open = i915_gpu_info_open,
+	.read = error_state_read,
+	.llseek = default_llseek,
+	.release = error_state_release,
+};
+
+static ssize_t
+i915_error_state_write(struct file *filp,
+		       const char __user *ubuf,
+		       size_t cnt,
+		       loff_t *ppos)
 {
-	struct i915_error_state_file_priv *error_priv = file->private_data;
-	struct drm_i915_error_state_buf error_str;
-	loff_t tmp_pos = 0;
-	ssize_t ret_count = 0;
-	int ret;
+	struct drm_i915_error_state *error = filp->private_data;
 
-	ret = i915_error_state_buf_init(&error_str, error_priv->i915,
-					count, *pos);
-	if (ret)
-		return ret;
+	if (!error)
+		return 0;
 
-	ret = i915_error_state_to_str(&error_str, error_priv);
-	if (ret)
-		goto out;
+	DRM_DEBUG_DRIVER("Resetting error state\n");
+	i915_destroy_error_state(error->i915);
 
-	ret_count = simple_read_from_buffer(userbuf, count, &tmp_pos,
-					    error_str.buf,
-					    error_str.bytes);
+	return cnt;
+}
 
-	if (ret_count < 0)
-		ret = ret_count;
-	else
-		*pos = error_str.start + ret_count;
-out:
-	i915_error_state_buf_release(&error_str);
-	return ret ?: ret_count;
+static int i915_error_state_open(struct inode *inode, struct file *file)
+{
+	struct drm_i915_private *i915 = inode->i_private;
+	struct drm_i915_error_state *error;
+
+	spin_lock_irq(&i915->gpu_error.lock);
+	error = i915->gpu_error.first_error;
+	if (error)
+		i915_error_state_get(error);
+	spin_unlock_irq(&i915->gpu_error.lock);
+
+	file->private_data = error;
+
+	return 0;
 }
 
 static const struct file_operations i915_error_state_fops = {
 	.owner = THIS_MODULE,
 	.open = i915_error_state_open,
-	.read = i915_error_state_read,
+	.read = error_state_read,
 	.write = i915_error_state_write,
 	.llseek = default_llseek,
-	.release = i915_error_state_release,
+	.release = error_state_release,
 };
-
 #endif
 
 static int
@@ -4707,6 +4721,7 @@ static const struct i915_debugfs_files {
 	{"i915_gem_drop_caches", &i915_drop_caches_fops},
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 	{"i915_error_state", &i915_error_state_fops},
+	{"i915_gpu_info", &i915_gpu_info_fops},
 #endif
 	{"i915_next_seqno", &i915_next_seqno_fops},
 	{"i915_display_crc_ctl", &i915_display_crc_ctl_fops},
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 27e74322ca70..cb688918a913 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1504,11 +1504,6 @@ struct drm_i915_error_state_buf {
 	loff_t pos;
 };
 
-struct i915_error_state_file_priv {
-	struct drm_i915_private *i915;
-	struct drm_i915_error_state *error;
-};
-
 #define I915_RESET_TIMEOUT (10 * HZ) /* 10s */
 #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */
 
@@ -3557,7 +3552,7 @@ static inline void intel_display_crc_init(struct drm_i915_private *dev_priv) {}
 __printf(2, 3)
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
 int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
-			    const struct i915_error_state_file_priv *error);
+			    const struct drm_i915_error_state *error);
 int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb,
 			      struct drm_i915_private *i915,
 			      size_t count, loff_t pos);
@@ -3566,12 +3561,25 @@ static inline void i915_error_state_buf_release(
 {
 	kfree(eb->buf);
 }
+struct drm_i915_error_state *i915_error_state(struct drm_i915_private *i915);
 void i915_capture_error_state(struct drm_i915_private *dev_priv,
 			      u32 engine_mask,
 			      const char *error_msg);
-void i915_error_state_get(struct drm_device *dev,
-			  struct i915_error_state_file_priv *error_priv);
-void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
+
+static inline struct drm_i915_error_state *
+i915_error_state_get(struct drm_i915_error_state *error)
+{
+	kref_get(&error->ref);
+	return error;
+}
+
+void __i915_error_state_free(struct kref *error_ref);
+static inline void i915_error_state_put(struct drm_i915_error_state *error)
+{
+	if (error)
+		kref_put(&error->ref, __i915_error_state_free);
+}
+
 void i915_destroy_error_state(struct drm_i915_private *dev_priv);
 
 #else
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5283fe815a4d..f8f6dad00d14 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -342,7 +342,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 }
 
 static void error_print_instdone(struct drm_i915_error_state_buf *m,
-				 struct drm_i915_error_engine *ee)
+				 const struct drm_i915_error_engine *ee)
 {
 	int slice;
 	int subslice;
@@ -372,7 +372,7 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
 
 static void error_print_request(struct drm_i915_error_state_buf *m,
 				const char *prefix,
-				struct drm_i915_error_request *erq)
+				const struct drm_i915_error_request *erq)
 {
 	if (!erq->seqno)
 		return;
@@ -386,7 +386,7 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
 
 static void error_print_context(struct drm_i915_error_state_buf *m,
 				const char *header,
-				struct drm_i915_error_context *ctx)
+				const struct drm_i915_error_context *ctx)
 {
 	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
 		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
@@ -394,7 +394,7 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
 }
 
 static void error_print_engine(struct drm_i915_error_state_buf *m,
-			       struct drm_i915_error_engine *ee)
+			       const struct drm_i915_error_engine *ee)
 {
 	err_printf(m, "%s command stream:\n", engine_str(ee->engine_id));
 	err_printf(m, "  START: 0x%08x\n", ee->start);
@@ -547,19 +547,13 @@ static void err_print_capabilities(struct drm_i915_error_state_buf *m,
 }
 
 int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
-			    const struct i915_error_state_file_priv *error_priv)
+			    const struct drm_i915_error_state *error)
 {
-	struct drm_i915_private *dev_priv = error_priv->i915;
+	struct drm_i915_private *dev_priv = error->i915;
 	struct pci_dev *pdev = dev_priv->drm.pdev;
-	struct drm_i915_error_state *error = error_priv->error;
 	struct drm_i915_error_object *obj;
 	int i, j;
 
-	if (!error) {
-		err_printf(m, "no error state collected\n");
-		goto out;
-	}
-
 	err_printf(m, "%s\n", error->error_msg);
 	err_printf(m, "Kernel: " UTS_RELEASE "\n");
 	err_printf(m, "Time: %ld s %ld us\n",
@@ -663,7 +657,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			    error->pinned_bo_count);
 
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		struct drm_i915_error_engine *ee = &error->engine[i];
+		const struct drm_i915_error_engine *ee = &error->engine[i];
 
 		obj = ee->batchbuffer;
 		if (obj) {
@@ -730,7 +724,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	if (error->display)
 		intel_display_print_error_state(m, dev_priv, error->display);
 
-out:
 	if (m->bytes == 0 && m->err)
 		return m->err;
 
@@ -782,7 +775,7 @@ static void i915_error_object_free(struct drm_i915_error_object *obj)
 	kfree(obj);
 }
 
-static void i915_error_state_free(struct kref *error_ref)
+void __i915_error_state_free(struct kref *error_ref)
 {
 	struct drm_i915_error_state *error = container_of(error_ref,
 							  typeof(*error), ref);
@@ -1609,6 +1602,23 @@ static int capture(void *data)
 
 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
 
+struct drm_i915_error_state *
+i915_error_state(struct drm_i915_private *i915)
+{
+	struct drm_i915_error_state *error;
+
+	error = kzalloc(sizeof(*error), GFP_ATOMIC);
+	if (!error)
+		return NULL;
+
+	kref_init(&error->ref);
+	error->i915 = i915;
+
+	stop_machine(capture, error, NULL);
+
+	return error;
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -1632,18 +1642,12 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
 	if (READ_ONCE(dev_priv->gpu_error.first_error))
 		return;
 
-	/* Account for pipe specific data like PIPE*STAT */
-	error = kzalloc(sizeof(*error), GFP_ATOMIC);
+	error = i915_error_state(dev_priv);
 	if (!error) {
 		DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
 		return;
 	}
 
-	kref_init(&error->ref);
-	error->i915 = dev_priv;
-
-	stop_machine(capture, error, NULL);
-
 	i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
 	DRM_INFO("%s\n", error->error_msg);
 
@@ -1657,7 +1661,7 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
 	}
 
 	if (error) {
-		i915_error_state_free(&error->ref);
+		__i915_error_state_free(&error->ref);
 		return;
 	}
 
@@ -1673,24 +1677,6 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
 	}
 }
 
-void i915_error_state_get(struct drm_device *dev,
-			  struct i915_error_state_file_priv *error_priv)
-{
-	struct drm_i915_private *dev_priv = to_i915(dev);
-
-	spin_lock_irq(&dev_priv->gpu_error.lock);
-	error_priv->error = dev_priv->gpu_error.first_error;
-	if (error_priv->error)
-		kref_get(&error_priv->error->ref);
-	spin_unlock_irq(&dev_priv->gpu_error.lock);
-}
-
-void i915_error_state_put(struct i915_error_state_file_priv *error_priv)
-{
-	if (error_priv->error)
-		kref_put(&error_priv->error->ref, i915_error_state_free);
-}
-
 void i915_destroy_error_state(struct drm_i915_private *dev_priv)
 {
 	struct drm_i915_error_state *error;
@@ -1700,6 +1686,5 @@ void i915_destroy_error_state(struct drm_i915_private *dev_priv)
 	dev_priv->gpu_error.first_error = NULL;
 	spin_unlock_irq(&dev_priv->gpu_error.lock);
 
-	if (error)
-		kref_put(&error->ref, i915_error_state_free);
+	i915_error_state_put(error);
 }
diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
index a721ff116101..3f1a825a9f24 100644
--- a/drivers/gpu/drm/i915/i915_sysfs.c
+++ b/drivers/gpu/drm/i915/i915_sysfs.c
@@ -522,30 +522,35 @@ static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
 
 	struct device *kdev = kobj_to_dev(kobj);
 	struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev);
-	struct drm_device *dev = &dev_priv->drm;
-	struct i915_error_state_file_priv error_priv;
 	struct drm_i915_error_state_buf error_str;
-	ssize_t ret_count = 0;
+	struct drm_i915_error_state *error;
+	ssize_t ret_count;
 	int ret;
 
-	memset(&error_priv, 0, sizeof(error_priv));
-
-	ret = i915_error_state_buf_init(&error_str, to_i915(dev), count, off);
+	ret = i915_error_state_buf_init(&error_str, dev_priv, count, off);
 	if (ret)
 		return ret;
 
-	error_priv.i915 = dev_priv;
-	i915_error_state_get(dev, &error_priv);
+	spin_lock_irq(&dev_priv->gpu_error.lock);
+	error = dev_priv->gpu_error.first_error;
+	if (error)
+		i915_error_state_get(error);
+	spin_unlock_irq(&dev_priv->gpu_error.lock);
 
-	ret = i915_error_state_to_str(&error_str, &error_priv);
-	if (ret)
-		goto out;
+	if (error) {
+		ret = i915_error_state_to_str(&error_str, error);
+		if (ret)
+			goto out;
+	} else {
+		error_str.bytes = strlen("No error state collected\n");
+		strcpy(error_str.buf, "No error state collected\n");
+	}
 
 	ret_count = count < error_str.bytes ? count : error_str.bytes;
-
 	memcpy(buf, error_str.buf, ret_count);
+
 out:
-	i915_error_state_put(&error_priv);
+	i915_error_state_put(error);
 	i915_error_state_buf_release(&error_str);
 
 	return ret ?: ret_count;
-- 
2.11.0



More information about the Intel-gfx mailing list