[Intel-gfx] [PATCH 14/20] drm/i915: TDR/watchdog trace points.

Thu Oct 22 18:32:36 PDT 2015

Defined trace points and sprinkled the usage of these throughout the
TDR/watchdog implementation.

The following trace points are supported:

	1. trace_i915_tdr_gpu_recovery:
	Called at the onset of the full GPU reset recovery path.

	2. trace_i915_tdr_engine_recovery:
	Called at the onset of the per-engine recovery path.

	3. i915_tdr_recovery_start:
	Called at the onset of hang recovery before recovery mode has been
	decided.

	4. i915_tdr_recovery_complete:
	Called at the point of hang recovery completion.

	5. i915_tdr_recovery_queued:
	Called once the error handler decides to schedule the actual hang
	recovery, which marks the end of the hang detection path.

	6. i915_tdr_engine_save:
	Called at the point of saving the engine state during per-engine hang
	recovery.

	7. i915_tdr_gpu_reset_complete:
	Called at the point of full GPU reset recovery completion.

	8. i915_tdr_engine_reset_complete:
	Called at the point of per-engine recovery completion.

	9. i915_tdr_forced_csb_check:
	Called at the completion of a forced CSB check.

	10. i915_tdr_hang_check:
	Called for every engine in the periodic hang checker loop before moving
	on to the next engine. Provides an overview of all hang check stats in
	real-time. The collected stats are:

		a. Engine name.

		b. Current engine seqno.

		c. Seqno of previous hang check iteration for that engine.

		d. ACTHD register value of given engine.

		e. Current hang check score of given engine (and whether or not
		the engine has been detected as hung).

		f. Current action for given engine.

		g. Busyness of given engine.

		h. Submission status of currently running context on given engine.

	11. i915_tdr_inconsistency:
	Called when an inconsistency is detected to provide more information in
	the log about the nature of the inconsistency. The collected
	information is:

		a. Engine name.

		b. ID of the currently executing context on hardware.

		c. Is the given engine idle or not?

		d. The ID of the context that was most recently submitted to
		the ELSP port from the execlist queue for the given engine.

		e. The submission/IRQ balance of the request most recently
		submitted to hardware (elsp_submitted).

Signed-off-by: Tomas Elf <tomas.elf at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c       |   3 +
 drivers/gpu/drm/i915/i915_drv.h       |   1 +
 drivers/gpu/drm/i915/i915_gpu_error.c |   2 +-
 drivers/gpu/drm/i915/i915_irq.c       |  11 +-
 drivers/gpu/drm/i915/i915_trace.h     | 339 ++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c      |  21 ++-
 drivers/gpu/drm/i915/intel_uncore.c   |   4 +
 7 files changed, 377 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c32c475..d482341 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -896,6 +896,7 @@ int i915_reset(struct drm_device *dev)
 	bool simulated;
 	int ret;
 
+	trace_i915_tdr_gpu_recovery(dev);
 	intel_reset_gt_powersave(dev);
 
 	mutex_lock(&dev->struct_mutex);
@@ -1120,6 +1121,8 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 
 	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
 
+	trace_i915_tdr_engine_recovery(engine);
+
         /* Take wake lock to prevent power saving mode */
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7ebf800..5b54675 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3335,6 +3335,7 @@ void i915_destroy_error_state(struct drm_device *dev);
 
 void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone);
 const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
+const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a);
 
 /* i915_cmd_parser.c */
 int i915_cmd_parser_get_version(void);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 2f04e4f..8c0cd0e 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -221,7 +221,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	}
 }
 
-static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
+const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
 {
 	switch (a) {
 	case HANGCHECK_IDLE:
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 71208de..8fe972b 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2454,6 +2454,7 @@ static void i915_error_work_func(struct work_struct *work)
 
 	mutex_lock(&dev->struct_mutex);
 
+	trace_i915_tdr_recovery_start(dev);
 	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
 
 	for_each_ring(ring, dev_priv, i) {
@@ -2572,6 +2573,7 @@ static void i915_error_work_func(struct work_struct *work)
 			kobject_uevent_env(&dev->primary->kdev->kobj,
 					   KOBJ_CHANGE, reset_done_event);
 	}
+	trace_i915_tdr_recovery_complete(dev);
 }
 
 static void i915_report_and_clear_eir(struct drm_device *dev)
@@ -2697,6 +2699,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	va_list args;
 	char error_msg[80];
+	bool full_reset = true;
 
 	struct intel_engine_cs *engine;
 
@@ -2715,7 +2718,6 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 		 *	2. The hardware does not support it (pre-gen7).
 		 *	3. We already tried per-engine reset recently.
 		 */
-		bool full_reset = true;
 
 		if (!i915.enable_engine_reset) {
 			DRM_INFO("Engine reset disabled: Using full GPU reset.\n");
@@ -2744,6 +2746,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 						i915.gpu_reset_promotion_time;
 
 					engine->hangcheck.last_engine_reset_time = now;
+
 				} else {
 					/*
 					 * Watchdog timeout always results
@@ -2792,6 +2795,8 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 		i915_error_wake_up(dev_priv, false);
 	}
 
+	trace_i915_tdr_recovery_queued(dev, engine_mask, watchdog, full_reset);
+
 	/*
 	 * Gen 7:
 	 *
@@ -3158,6 +3163,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	for_each_ring(ring, dev_priv, i) {
 		u64 acthd;
 		u32 seqno;
+		u32 head;
 		bool busy = true;
 
 		semaphore_clear_deadlocks(dev_priv);
@@ -3234,7 +3240,10 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 
 		ring->hangcheck.seqno = seqno;
 		ring->hangcheck.acthd = acthd;
+		head = I915_READ_HEAD(ring);
 		busy_count += busy;
+
+		trace_i915_tdr_hang_check(ring, seqno, acthd, head, busy);
 	}
 
 	for_each_ring(ring, dev_priv, i) {
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 04fe849..8d86a7d 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -810,6 +810,345 @@ TRACE_EVENT(switch_mm,
 		  __entry->dev, __entry->ring, __entry->to, __entry->vm)
 );
 
+/**
+ * DOC: i915_tdr_gpu_recovery
+ *
+ * This tracepoint tracks the onset of the full GPU recovery path
+ */
+TRACE_EVENT(i915_tdr_gpu_recovery,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+	),
+
+	TP_printk("dev=%u, full GPU recovery started",
+		  __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_engine_recovery
+ *
+ * This tracepoint tracks the onset of the engine recovery path
+ */
+TRACE_EVENT(i915_tdr_engine_recovery,
+	TP_PROTO(struct intel_engine_cs *engine),
+
+	TP_ARGS(engine),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+	),
+
+	TP_printk("dev=%u, engine=%u, recovery of %s started",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->id,
+		  __entry->engine->name)
+);
+
+/**
+ * DOC: i915_tdr_recovery_start
+ *
+ * This tracepoint tracks hang recovery start
+ */
+TRACE_EVENT(i915_tdr_recovery_start,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+	),
+
+	TP_printk("dev=%u, hang recovery started",
+		  __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_recovery_complete
+ *
+ * This tracepoint tracks hang recovery completion
+ */
+TRACE_EVENT(i915_tdr_recovery_complete,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+	),
+
+	TP_printk("dev=%u, hang recovery completed",
+		  __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_recovery_queued
+ *
+ * This tracepoint tracks the point of queuing recovery from hang check.
+ * If engine recovery is requested engine name will be displayed, otherwise
+ * it will be set to "none". If too many engine reset was attempted in the
+ * previous history we promote to full GPU reset, which is remarked by appending
+ * the "[PROMOTED]" flag.
+ */
+TRACE_EVENT(i915_tdr_recovery_queued,
+	TP_PROTO(struct drm_device *dev,
+		 u32 hung_engines,
+		 bool watchdog,
+		 bool full_reset),
+
+	TP_ARGS(dev, hung_engines, watchdog, full_reset),
+
+	TP_STRUCT__entry(
+			__field(u32, dev)
+			__field(u32, hung_engines)
+			__field(bool, watchdog)
+			__field(bool, full_reset)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev->primary->index;
+			__entry->hung_engines = hung_engines;
+			__entry->watchdog = watchdog;
+			__entry->full_reset = full_reset;
+	),
+
+	TP_printk("dev=%u, hung_engines=0x%02x%s%s%s%s%s%s%s, watchdog=%s, full_reset=%s",
+		  __entry->dev,
+		  __entry->hung_engines,
+		  __entry->hung_engines ? " (":"",
+		  __entry->hung_engines & RENDER_RING ? " [RCS] " : "",
+		  __entry->hung_engines & BSD_RING ? 	" [VCS] " : "",
+		  __entry->hung_engines & BLT_RING ? 	" [BCS] " : "",
+		  __entry->hung_engines & VEBOX_RING ? 	" [VECS] " : "",
+		  __entry->hung_engines & BSD2_RING ? 	" [VCS2] " : "",
+		  __entry->hung_engines ? ")":"",
+		  __entry->watchdog ? "true" : "false",
+		  __entry->full_reset ?
+			(__entry->hung_engines ? "true [PROMOTED]" : "true") :
+				"false")
+);
+
+/**
+ * DOC: i915_tdr_engine_save
+ *
+ * This tracepoint tracks the point of engine state save during the engine
+ * recovery path. Logs the head pointer position at point of hang, the position
+ * after recovering and whether or not we forced a head pointer advancement or
+ * rounded up to an aligned QWORD position.
+ */
+TRACE_EVENT(i915_tdr_engine_save,
+	TP_PROTO(struct intel_engine_cs *engine,
+		 u32 old_head,
+		 u32 new_head,
+		 bool forced_advance),
+
+	TP_ARGS(engine, old_head, new_head, forced_advance),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+			__field(u32, old_head)
+			__field(u32, new_head)
+			__field(bool, forced_advance)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+			__entry->old_head = old_head;
+			__entry->new_head = new_head;
+			__entry->forced_advance = forced_advance;
+	),
+
+	TP_printk("dev=%u, engine=%s, old_head=%u, new_head=%u, forced_advance=%s",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->old_head,
+		  __entry->new_head,
+		  __entry->forced_advance ? "true" : "false")
+);
+
+/**
+ * DOC: i915_tdr_gpu_reset_complete
+ *
+ * This tracepoint tracks the point of full GPU reset completion
+ */
+TRACE_EVENT(i915_tdr_gpu_reset_complete,
+	TP_PROTO(struct drm_device *dev),
+
+	TP_ARGS(dev),
+
+	TP_STRUCT__entry(
+			__field(struct drm_device *, dev)
+	),
+
+	TP_fast_assign(
+			__entry->dev = dev;
+	),
+
+	TP_printk("dev=%u, resets=%u",
+		__entry->dev->primary->index,
+		i915_reset_count(&((struct drm_i915_private *)
+			(__entry->dev)->dev_private)->gpu_error) )
+);
+
+/**
+ * DOC: i915_tdr_engine_reset_complete
+ *
+ * This tracepoint tracks the point of engine reset completion
+ */
+TRACE_EVENT(i915_tdr_engine_reset_complete,
+	TP_PROTO(struct intel_engine_cs *engine),
+
+	TP_ARGS(engine),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+	),
+
+	TP_printk("dev=%u, engine=%s, resets=%u",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->engine->hangcheck.reset_count)
+);
+
+/**
+ * DOC: i915_tdr_forced_csb_check
+ *
+ * This tracepoint tracks the occurences of forced CSB checks
+ * that the driver does when detecting inconsistent context
+ * submission states between the driver state and the current
+ * CPU engine state.
+ */
+TRACE_EVENT(i915_tdr_forced_csb_check,
+	TP_PROTO(struct intel_engine_cs *engine,
+		 bool was_effective),
+
+	TP_ARGS(engine, was_effective),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+			__field(bool, was_effective)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+			__entry->was_effective = was_effective;
+	),
+
+	TP_printk("dev=%u, engine=%s, was_effective=%s",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->was_effective ? "yes" : "no")
+);
+
+/**
+ * DOC: i915_tdr_hang_check
+ *
+ * This tracepoint tracks hang checks on each engine.
+ */
+TRACE_EVENT(i915_tdr_hang_check,
+	TP_PROTO(struct intel_engine_cs *engine,
+		 u32 seqno,
+		 u64 acthd,
+		 u32 hd,
+		 bool busy),
+
+	TP_ARGS(engine, seqno, acthd, hd, busy),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+			__field(u32, seqno)
+			__field(u64, acthd)
+			__field(u32, hd)
+			__field(bool, busy)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+			__entry->seqno = seqno;
+			__entry->acthd = acthd;
+			__entry->hd = hd;
+			__entry->busy = busy;
+	),
+
+	TP_printk("dev=%u, engine=%s, seqno=%u (%d), last seqno=%u (%d), head=%u (%d), acthd=%lu, score=%d%s, action=%u [%s], busy=%s",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->seqno,
+		  __entry->seqno,
+		  __entry->engine->hangcheck.seqno,
+		  __entry->engine->hangcheck.seqno,
+		  __entry->hd,
+		  __entry->hd,
+		  (long unsigned int) __entry->acthd,
+		  __entry->engine->hangcheck.score,
+		  (__entry->engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) ? " [HUNG]" : "",
+		  (unsigned int) __entry->engine->hangcheck.action,
+		  hangcheck_action_to_str(__entry->engine->hangcheck.action),
+		  __entry->busy ? "yes" : "no")
+);
+
+/**
+ * DOC: i915_tdr_inconsistency
+ *
+ * This tracepoint tracks detected inconsistencies
+ */
+TRACE_EVENT(i915_tdr_inconsistency,
+	TP_PROTO(struct intel_engine_cs *engine,
+		 u32 hw_context,
+		 bool hw_active,
+		 u32 sw_context,
+		 struct drm_i915_gem_request *req),
+
+	TP_ARGS(engine, hw_context, hw_active, sw_context, req),
+
+	TP_STRUCT__entry(
+			__field(struct intel_engine_cs *, engine)
+			__field(u32, hw_context)
+			__field(bool, hw_active)
+			__field(u32, sw_context)
+			__field(int, elsp_submitted)
+	),
+
+	TP_fast_assign(
+			__entry->engine = engine;
+			__entry->hw_context = hw_context;
+			__entry->hw_active = hw_active;
+			__entry->sw_context = sw_context;
+			__entry->elsp_submitted = req?req->elsp_submitted:0;
+	),
+
+	TP_printk("dev=%u, engine=%s, hw_context=%x, hw_active=%s, sw_context=%x, elsp_submitted=%d",
+		  __entry->engine->dev->primary->index,
+		  __entry->engine->name,
+		  __entry->hw_context,
+		  __entry->hw_active?"true":"false",
+		  __entry->sw_context,
+		  __entry->elsp_submitted)
+);
+
 #endif /* _I915_TRACE_H_ */
 
 /* This part must be outside protection */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5bb7d6e..96f5a28 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2285,7 +2285,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
 	struct intel_context *ctx;
 	int ret = 0;
 	int clamp_to_tail = 0;
-	uint32_t head;
+	uint32_t head, old_head;
 	uint32_t tail;
 	uint32_t head_addr;
 	uint32_t tail_addr;
@@ -2300,7 +2300,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
 	 * Read head from MMIO register since it contains the
 	 * most up to date value of head at this point.
 	 */
-	head = I915_READ_HEAD(ring);
+	old_head = head = I915_READ_HEAD(ring);
 
 	/*
 	 * Read tail from the context because the execlist queue
@@ -2357,6 +2357,9 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
 	head |= (head_addr & HEAD_ADDR);
 	ring->saved_head = head;
 
+	trace_i915_tdr_engine_save(ring, old_head,
+		head, force_advance);
+
 	return 0;
 }
 
@@ -3264,6 +3267,19 @@ intel_execlists_TDR_get_current_request(struct intel_engine_cs *ring,
 			CONTEXT_SUBMISSION_STATUS_NONE_SUBMITTED;
 	}
 
+	/*
+	 * This may or may not be a sustained inconsistency. Most of the time
+	 * it's only a matter of a transitory inconsistency during context
+	 * submission/completion but if we happen to detect a sustained
+	 * inconsistency then it helps to have more information.
+	 */
+	if (status == CONTEXT_SUBMISSION_STATUS_INCONSISTENT)
+		trace_i915_tdr_inconsistency(ring,
+					     hw_context,
+					     hw_active,
+					     sw_context,
+					     tmpreq);
+
 	if (req)
 		*req = tmpreq;
 
@@ -3328,6 +3344,7 @@ bool intel_execlists_TDR_force_CSB_check(struct drm_i915_private *dev_priv,
 
 	spin_unlock_irqrestore(&engine->execlist_lock, flags);
 
+	trace_i915_tdr_forced_csb_check(engine, !!was_effective);
 	wake_up_all(&engine->irq_queue);
 
 	return !!was_effective;
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 400b679..625eda8 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1464,6 +1464,8 @@ static int gen6_do_reset(struct drm_device *dev)
 	/* Spin waiting for the device to ack the reset request */
 	ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST) & GEN6_GRDOM_FULL) == 0, 500);
 
+	trace_i915_tdr_gpu_reset_complete(dev);
+
 	intel_uncore_forcewake_reset(dev, true);
 
 	return ret;
@@ -1620,6 +1622,8 @@ static int do_engine_reset_nolock(struct intel_engine_cs *engine)
 		break;
 	}
 
+	trace_i915_tdr_engine_reset_complete(engine);
+
 	return ret;
 }
 
-- 
1.9.1