[Intel-gfx] [RFCv2 09/12] drm/i915: TDR/watchdog trace points.

Tue Jul 21 06:58:52 PDT 2015

Defined trace points and sprinkled the usage of these throughout the
TDR/watchdog implementation.

The following trace points are supported:

	1. trace_i915_tdr_gpu_recovery:
	Called at the onset of the full GPU reset recovery path.

	2. trace_i915_tdr_engine_recovery:
	Called at the onset of the per-engine recovery path.

	3. i915_tdr_recovery_start:
	Called at the onset of hang recovery before recovery mode has been
	decided.

	4. i915_tdr_recovery_complete:
	Called at the point of hang recovery completion.

	5. i915_tdr_recovery_queued:
	Called once the error handler decides to schedule the actual hang
	recovery, which marks the end of the hang detection path.

	6. i915_tdr_engine_save:
	Called at the point of saving the engine state during per-engine hang
	recovery.

	7. i915_tdr_gpu_reset_complete:
	Called at the point of full GPU reset recovery completion.

	8. i915_tdr_engine_reset_complete:
	Called at the point of per-engine recovery completion.

	9. i915_tdr_forced_csb_check:
	Called at the completion of a forced CSB check.

	10. i915_tdr_hang_check:
	Called for every engine in the periodic hang checker loop before moving
	on to the next engine. Provides an overview of all hang check stats in
	real-time. The collected stats are:

		a. Engine name.

		b. Current engine seqno.

		c. Seqno of previous hang check iteration for that engine.

		d. ACTHD register value of given engine.

		e. Current hang check score of given engine (and whether or not
		the engine has been detected as hung).

		f. Current action for given engine.

		g. Busyness of given engine.

		h. Submission status of currently running context on given engine.

* v2:
As a consequence of the faking context event interrupt commit being moved from
the hang checker to the per-engine recovery path we no longer check context
submission status from the hang checker. Therefore there is no need to provide
submission status of the currently running context to the
trace_i915_tdr_hang_check() event.

Signed-off-by: Tomas Elf <tomas.elf at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c       |   3 +
 drivers/gpu/drm/i915/i915_drv.h       |   1 +
 drivers/gpu/drm/i915/i915_gpu_error.c |   2 +-
 drivers/gpu/drm/i915/i915_irq.c       |   9 +-
 drivers/gpu/drm/i915/i915_trace.h     | 293 ++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c      |   8 +-
 drivers/gpu/drm/i915/intel_uncore.c   |   4 +
 7 files changed, 316 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 2ccb2e8..f923036 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -845,6 +845,7 @@ int i915_reset(struct drm_device *dev)
 	if (!i915.reset)
 		return 0;
 
+	trace_i915_tdr_gpu_recovery(dev);
 	intel_reset_gt_powersave(dev);
 
 	mutex_lock(&dev->struct_mutex);
@@ -1017,6 +1018,8 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 
 	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
 
+	trace_i915_tdr_engine_recovery(engine);
+
         /* Take wake lock to prevent power saving mode */
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3d31872..9f37cba 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3129,6 +3129,7 @@ void i915_destroy_error_state(struct drm_device *dev);
 
 void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone);
 const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
+const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a);
 
 /* i915_cmd_parser.c */
 int i915_cmd_parser_get_version(void);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ac22614..cee1825 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -220,7 +220,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	}
 }
 
-static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
+const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
 {
 	switch (a) {
 	case HANGCHECK_IDLE:
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 22dd96c..beee624 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2359,6 +2359,7 @@ static void i915_error_work_func(struct work_struct *work)
 
 	mutex_lock(&dev->struct_mutex);
 
+	trace_i915_tdr_recovery_start(dev);
 	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
 
 	for_each_ring(ring, dev_priv, i) {
@@ -2477,6 +2478,7 @@ static void i915_error_work_func(struct work_struct *work)
 			kobject_uevent_env(&dev->primary->kdev->kobj,
 					   KOBJ_CHANGE, reset_done_event);
 	}
+	trace_i915_tdr_recovery_complete(dev);
 }
 
 static void i915_report_and_clear_eir(struct drm_device *dev)
@@ -2607,6 +2609,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	va_list args;
 	char error_msg[80];
+	bool full_reset = true;
 
 	struct intel_engine_cs *engine;
 
@@ -2624,7 +2627,6 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 		 *	2. The hardware does not support it (pre-gen7).
 		 *	3. We already tried per-engine reset recently.
 		 */
-		bool full_reset = true;
 
 		/*
 		 * TBD: We currently only support per-engine reset for gen8+.
@@ -2648,6 +2650,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 						i915.gpu_reset_promotion_time;
 
 					engine->hangcheck.last_engine_reset_time = now;
+
 				} else {
 					/*
 					 * Watchdog timeout always results
@@ -2696,6 +2699,8 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 		i915_error_wake_up(dev_priv, false);
 	}
 
+	trace_i915_tdr_recovery_queued(dev, engine_mask, watchdog, full_reset);
+
 	/*
 	 * Gen 7:
 	 *
@@ -3146,6 +3151,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		ring->hangcheck.seqno = seqno;
 		ring->hangcheck.acthd = acthd;
 		busy_count += busy;
+
+		trace_i915_tdr_hang_check(ring, seqno, acthd, busy);
 	}
 
 	for_each_ring(ring, dev_priv, i) {
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 2aa140e..b970259 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -775,6 +775,299 @@ TRACE_EVENT(switch_mm,
 		  __entry->dev, __entry->ring, __entry->to, __entry->vm)
 );
 
+/**
+ * DOC: i915_tdr_gpu_recovery
+ *
+ * This tracepoint tracks the onset of the full GPU recovery path
+ */
+TRACE_EVENT(i915_tdr_gpu_recovery,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+	),
+
+	TP_printk("dev=%u, full GPU recovery started",
+		  __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_engine_recovery
+ *
+ * This tracepoint tracks the onset of the engine recovery path
+ */
+TRACE_EVENT(i915_tdr_engine_recovery,
+	TP_PROTO(struct intel_engine_cs *engine),
+
+	TP_ARGS(engine),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+	),
+
+	TP_printk("dev=%u, engine=%u, recovery of %s started",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->id,
+		  __entry->engine->name)
+);
+
+/**
+ * DOC: i915_tdr_recovery_start
+ *
+ * This tracepoint tracks hang recovery start
+ */
+TRACE_EVENT(i915_tdr_recovery_start,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+	),
+
+	TP_printk("dev=%u, hang recovery started",
+		  __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_recovery_complete
+ *
+ * This tracepoint tracks hang recovery completion
+ */
+TRACE_EVENT(i915_tdr_recovery_complete,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+	),
+
+	TP_printk("dev=%u, hang recovery completed",
+		  __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_recovery_queued
+ *
+ * This tracepoint tracks the point of queuing recovery from hang check.
+ * If engine recovery is requested engine name will be displayed, otherwise
+ * it will be set to "none". If too many engine reset was attempted in the
+ * previous history we promote to full GPU reset, which is remarked by appending
+ * the "[PROMOTED]" flag.
+ */
+TRACE_EVENT(i915_tdr_recovery_queued,
+	TP_PROTO(struct drm_device *dev,
+		 u32 hung_engines,
+		 bool watchdog,
+		 bool full_reset),
+
+	TP_ARGS(dev, hung_engines, watchdog, full_reset),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+			__field(u32, hung_engines)
+			__field(bool, watchdog)
+			__field(bool, full_reset)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+			__entry->hung_engines = hung_engines;
+			__entry->watchdog = watchdog;
+			__entry->full_reset = full_reset;
+	),
+
+	TP_printk("dev=%u, hung_engines=0x%02x%s%s%s%s%s%s%s, watchdog=%s, full_reset=%s",
+		  __entry->dev,
+		  __entry->hung_engines,
+		  __entry->hung_engines ? " (":"",
+		  __entry->hung_engines & RENDER_RING ? " [RCS] " : "",
+		  __entry->hung_engines & BSD_RING ? 	" [VCS] " : "",
+		  __entry->hung_engines & BLT_RING ? 	" [BCS] " : "",
+		  __entry->hung_engines & VEBOX_RING ? 	" [VECS] " : "",
+		  __entry->hung_engines & BSD2_RING ? 	" [VCS2] " : "",
+		  __entry->hung_engines ? ")":"",
+		  __entry->watchdog ? "true" : "false",
+		  __entry->full_reset ?
+			(__entry->hung_engines ? "true [PROMOTED]" : "true") :
+				"false")
+);
+
+/**
+ * DOC: i915_tdr_engine_save
+ *
+ * This tracepoint tracks the point of engine state save during the engine
+ * recovery path. Logs the head pointer position at point of hang, the position
+ * after recovering and whether or not we forced a head pointer advancement or
+ * rounded up to an aligned QWORD position.
+ */
+TRACE_EVENT(i915_tdr_engine_save,
+	TP_PROTO(struct intel_engine_cs *engine,
+		 u32 old_head,
+		 u32 new_head,
+		 bool forced_advance),
+
+	TP_ARGS(engine, old_head, new_head, forced_advance),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+			__field(u32, old_head)
+			__field(u32, new_head)
+			__field(bool, forced_advance)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+			__entry->old_head = old_head;
+			__entry->new_head = new_head;
+			__entry->forced_advance = forced_advance;
+	),
+
+	TP_printk("dev=%u, engine=%s, old_head=%u, new_head=%u, forced_advance=%s",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->old_head,
+		  __entry->new_head,
+		  __entry->forced_advance ? "true" : "false")
+);
+
+/**
+ * DOC: i915_tdr_gpu_reset_complete
+ *
+ * This tracepoint tracks the point of full GPU reset completion
+ */
+TRACE_EVENT(i915_tdr_gpu_reset_complete,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(struct drm_device *, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev;
+	),
+
+	TP_printk("dev=%u, resets=%u",
+		__entry->dev->primary->index,
+		i915_reset_count(&((struct drm_i915_private *)
+			(__entry->dev)->dev_private)->gpu_error) )
+);
+
+/**
+ * DOC: i915_tdr_engine_reset_complete
+ *
+ * This tracepoint tracks the point of engine reset completion
+ */
+TRACE_EVENT(i915_tdr_engine_reset_complete,
+	TP_PROTO(struct intel_engine_cs *engine),
+
+	TP_ARGS(engine),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+	),
+
+	TP_printk("dev=%u, engine=%s, resets=%u",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->engine->hangcheck.reset_count)
+);
+
+/**
+ * DOC: i915_tdr_forced_csb_check
+ *
+ * This tracepoint tracks the occurences of forced CSB checks
+ * that the driver does when detecting inconsistent context
+ * submission states between the driver state and the current
+ * CPU engine state.
+ */
+TRACE_EVENT(i915_tdr_forced_csb_check,
+	TP_PROTO(struct intel_engine_cs *engine,
+		 bool was_effective),
+
+	TP_ARGS(engine, was_effective),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+			__field(bool, was_effective)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+			__entry->was_effective = was_effective;
+	),
+
+	TP_printk("dev=%u, engine=%s, was_effective=%s",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->was_effective ? "yes" : "no")
+);
+
+/**
+ * DOC: i915_tdr_hang_check
+ *
+ * This tracepoint tracks hang checks on each engine.
+ */
+TRACE_EVENT(i915_tdr_hang_check,
+	TP_PROTO(struct intel_engine_cs *engine,
+		 u32 seqno,
+		 u64 acthd,
+		 bool busy),
+
+	TP_ARGS(engine, seqno, acthd, busy),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+			__field(u32, seqno)
+			__field(u64, acthd)
+			__field(bool, busy)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+			__entry->seqno = seqno;
+			__entry->acthd = acthd;
+			__entry->busy = busy;
+	),
+
+	TP_printk("dev=%u, engine=%s, seqno=%u, last seqno=%u, acthd=%lu, score=%d%s, action=%u [%s], busy=%s",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->seqno,
+		  __entry->engine->hangcheck.seqno,
+		  (long unsigned int) __entry->acthd,
+		  __entry->engine->hangcheck.score,
+		  (__entry->engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) ? " [HUNG]" : "",
+		  (unsigned int) __entry->engine->hangcheck.action,
+		  hangcheck_action_to_str(__entry->engine->hangcheck.action),
+		  __entry->busy ? "yes" : "no")
+);
+
 #endif /* _I915_TRACE_H_ */
 
 /* This part must be outside protection */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 476eff0..a8f2954 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1663,7 +1663,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
 	struct intel_context *ctx;
 	int ret = 0;
 	int clamp_to_tail = 0;
-	uint32_t head;
+	uint32_t head, old_head;
 	uint32_t tail;
 	uint32_t head_addr;
 	uint32_t tail_addr;
@@ -1678,7 +1678,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
 	 * Read head from MMIO register since it contains the
 	 * most up to date value of head at this point.
 	 */
-	head = I915_READ_HEAD(ring);
+	old_head = head = I915_READ_HEAD(ring);
 
 	/*
 	 * Read tail from the context because the execlist queue
@@ -1735,6 +1735,9 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
 	head |= (head_addr & HEAD_ADDR);
 	ring->saved_head = head;
 
+	trace_i915_tdr_engine_save(ring, old_head,
+		head, force_advance);
+
 	return 0;
 }
 
@@ -2638,6 +2641,7 @@ bool intel_execlists_TDR_force_CSB_check(struct drm_i915_private *dev_priv,
 		DRM_ERROR("Forced CSB check of %s ineffective!\n", engine->name);
 	spin_unlock_irqrestore(&engine->execlist_lock, flags);
 
+	trace_i915_tdr_forced_csb_check(engine, !!was_effective);
 	wake_up_all(&engine->irq_queue);
 
 	return !!was_effective;
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 91427ac..67a34aa 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1442,6 +1442,8 @@ static int gen6_do_reset(struct drm_device *dev)
 	/* Spin waiting for the device to ack the reset request */
 	ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST) & GEN6_GRDOM_FULL) == 0, 500);
 
+	trace_i915_tdr_gpu_reset_complete(dev);
+
 	intel_uncore_forcewake_reset(dev, true);
 
 	return ret;
@@ -1520,6 +1522,8 @@ static int do_engine_reset_nolock(struct intel_engine_cs *engine)
 		break;
 	}
 
+	trace_i915_tdr_engine_reset_complete(engine);
+
 	return ret;
 }
 
-- 
1.9.1