[Intel-gfx] [PATCH 15/16] drm/i915: Mechanism to forward clock monotonic time in perf samples

sourab.gupta at intel.com sourab.gupta at intel.com
Fri Apr 22 11:34:04 UTC 2016


From: Sourab Gupta <sourab.gupta at intel.com>

Currently, we can only forward the GPU timestamps in the
samples (which are generated via OA reports or PIPE_CONTROL commands
inserted in the ring). This limits the ability to correlate these samples
with system events. If we scale the GPU timestamps according to the
timestamp base/frequency info present in bspec, it is observed that the
timestamps drift really quickly from the system time.

An ability is therefore needed to report timestamps in different clock
domains, such as CLOCK_MONOTONIC, in the perf samples to be of more
practical use to the userspace. This ability becomes important when
we want to correlate/plot GPU events/samples with other system events
on the same timeline (e.g. vblank events, or timestamps when work was
submitted to kernel, etc.)

The patch here proposes a mechanism to achieve this. The gpu time and
CLOCK_MONOTONIC system time are correlated to detect and correct the
error in the published gpu timestamp clock frequency. Userspace can
request CLOCK_MONOTONIC in samples by setting the corresponding
property while opening the stream.

Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h  |  12 ++-
 drivers/gpu/drm/i915/i915_perf.c | 218 ++++++++++++++++++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_reg.h  |  10 ++
 include/uapi/drm/i915_drm.h      |   9 +-
 4 files changed, 230 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0923a17..e6a1a93 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1749,6 +1749,9 @@ struct i915_perf_stream {
 	/* Whether the OA unit is in use */
 	bool using_oa;
 
+	/* monotonic clk ts for last sample */
+	u64 last_sample_ts;
+
 	/* Enables the collection of HW samples, either in response to
 	 * I915_PERF_IOCTL_ENABLE or implicitly called when stream is
 	 * opened without I915_PERF_FLAG_DISABLED.
@@ -2144,6 +2147,14 @@ struct drm_i915_private {
 		struct i915_perf_stream *ring_stream[I915_NUM_ENGINES];
 		wait_queue_head_t poll_wq[I915_NUM_ENGINES];
 
+		/* Timekeeping Info */
+		u64 clk_mono; /* last monotonic clk value */
+		u64 gpu_time; /* last gpu time value */
+		s64 clk_offset; /* Offset between clk mono and gpu time */
+		u32 timestamp_frequency;
+		u32 resync_period; /* in msecs */
+		struct delayed_work clk_sync_work;
+
 		struct {
 			u32 specific_ctx_id;
 
@@ -2152,7 +2163,6 @@ struct drm_i915_private {
 
 			bool periodic;
 			int period_exponent;
-			int timestamp_frequency;
 
 			int tail_margin;
 
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index abb9d04..af9ec93 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -61,6 +61,12 @@
 #define POLL_FREQUENCY 200
 #define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY)
 
+/* Upper bound for the clock resync period. Defined as 25 seconds, as this is
+ * seen to give best results.
+ */
+#define MAX_CLK_SYNC_PERIOD (25*MSEC_PER_SEC) /* in msecs */
+#define INIT_CLK_SYNC_PERIOD (20) /* initial resync period, in msecs */
+
 static u32 i915_perf_stream_paranoid = true;
 
 /* The maximum exponent the hardware accepts is 63 (essentially it selects one
@@ -93,7 +99,8 @@ struct sample_data {
 	u32 ctx_id;
 	u32 pid;
 	u32 tag;
-	u64 ts;
+	u64 gpu_ts;
+	u64 clk_mono;
 	const u8 *report;
 };
 
@@ -142,6 +149,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
 #define SAMPLE_PID		(1<<3)
 #define SAMPLE_TAG		(1<<4)
 #define SAMPLE_TS		(1<<5)
+#define SAMPLE_CLK_MONO		(1<<6)
 
 struct perf_open_properties {
 	u32 sample_flags;
@@ -232,7 +240,7 @@ static int insert_perf_entry(struct drm_i915_private *dev_priv,
 
 	if (stream->sample_flags & SAMPLE_OA_REPORT)
 		entry_size += dev_priv->perf.oa.oa_buffer.format_size;
-	else if (sample_flags & SAMPLE_TS) {
+	else if (sample_flags & (SAMPLE_TS|SAMPLE_CLK_MONO)) {
 		/*
 		 * XXX: Since TS data can anyways be derived from OA report, so
 		 * no need to capture it for RCS engine, if capture oa data is
@@ -501,7 +509,7 @@ static void i915_ring_stream_cs_hook(struct i915_perf_stream *stream,
 		ret = i915_ring_stream_capture_oa(req, entry->oa_offset);
 		if (ret)
 			goto err_unref;
-	} else if (sample_flags & SAMPLE_TS) {
+	} else if (sample_flags & (SAMPLE_TS|SAMPLE_CLK_MONO)) {
 		/*
 		 * XXX: Since TS data can anyways be derived from OA report, so
 		 * no need to capture it for RCS engine, if capture oa data is
@@ -758,7 +766,13 @@ static int append_sample(struct i915_perf_stream *stream,
 	}
 
 	if (sample_flags & SAMPLE_TS) {
-		if (copy_to_user(buf, &data->ts, I915_PERF_TS_SAMPLE_SIZE))
+		if (copy_to_user(buf, &data->gpu_ts, I915_PERF_TS_SAMPLE_SIZE))
+			return -EFAULT;
+		buf += I915_PERF_TS_SAMPLE_SIZE;
+	}
+
+	if (sample_flags & SAMPLE_CLK_MONO) {
+		if (copy_to_user(buf, &data->clk_mono, I915_PERF_TS_SAMPLE_SIZE))
 			return -EFAULT;
 		buf += I915_PERF_TS_SAMPLE_SIZE;
 	}
@@ -775,6 +789,40 @@ static int append_sample(struct i915_perf_stream *stream,
 	return 0;
 }
 
+static u64 get_current_gpu_ts(struct drm_i915_private *dev_priv)
+{
+	return	((u64)I915_READ(GT_TIMESTAMP_COUNT_UDW) << 32) | /* upper 32 bits */
+		I915_READ(GT_TIMESTAMP_COUNT); /* lower 32 bits; NOTE(review): separate read, may tear on wrap */
+}
+
+static u64 get_clk_mono_from_gpu_ts(struct i915_perf_stream *stream,
+					u64 gpu_ts)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	u64 remainder, ts_interval = NSEC_PER_SEC;
+	u64 gpu_freq = dev_priv->perf.timestamp_frequency; /* NOTE(review): updated by clk_sync_work; no lock taken here */
+	u64 gpu_time, clk_mono;
+
+	remainder = do_div(ts_interval, gpu_freq); /* ts_interval: whole ns per tick */
+
+	remainder *= gpu_ts;
+	do_div(remainder, gpu_freq); /* ns from the fractional ns-per-tick part */
+	gpu_time = (ts_interval*gpu_ts) + remainder; /* gpu ticks scaled to ns */
+
+	clk_mono = gpu_time - dev_priv->perf.clk_offset; /* clk_offset is gpu_time - clk_mono at last sync */
+
+	/* Ensure monotonicity by clamping the system time to the last value
+	 * returned for this stream if it tries to go backwards. This may
+	 * happen during re-syncing clocks, when the gpu clock is faster.
+	 * FIXME: Any other mechanism to ensure monotonicity?
+	 */
+	if (clk_mono < stream->last_sample_ts)
+		clk_mono = stream->last_sample_ts;
+
+	stream->last_sample_ts = clk_mono;
+	return clk_mono;
+}
+
 static u64 get_gpu_ts_from_oa_report(struct drm_i915_private *dev_priv,
 					const u8 *report)
 {
@@ -831,7 +879,13 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream,
 
 	/* Derive timestamp from OA report */
 	if (sample_flags & SAMPLE_TS)
-		data.ts = get_gpu_ts_from_oa_report(dev_priv, report);
+		data.gpu_ts = get_gpu_ts_from_oa_report(dev_priv, report);
+
+	if (sample_flags & SAMPLE_CLK_MONO) {
+		u64 gpu_ts = get_gpu_ts_from_oa_report(dev_priv, report);
+
+		data.clk_mono = get_clk_mono_from_gpu_ts(stream, gpu_ts);
+	}
 
 	if (sample_flags & SAMPLE_OA_REPORT)
 		data.report = report;
@@ -1261,7 +1315,7 @@ static int append_one_cs_sample(struct i915_perf_stream *stream,
 		if (ret)
 			return ret;
 
-		if (sample_flags & SAMPLE_TS)
+		if (sample_flags & (SAMPLE_TS|SAMPLE_CLK_MONO))
 			gpu_ts = get_gpu_ts_from_oa_report(dev_priv, report);
 	}
 
@@ -1283,7 +1337,7 @@ static int append_one_cs_sample(struct i915_perf_stream *stream,
 		dev_priv->perf.last_tag = node->tag;
 	}
 
-	if (sample_flags & SAMPLE_TS) {
+	if (sample_flags & (SAMPLE_TS|SAMPLE_CLK_MONO)) {
 		/* If OA sampling is enabled, derive the ts from OA report.
 		 * Else, forward the timestamp collected via command stream.
 		 */
@@ -1291,7 +1345,12 @@ static int append_one_cs_sample(struct i915_perf_stream *stream,
 			gpu_ts = *(u64 *)
 				(dev_priv->perf.command_stream_buf[id].addr +
 					node->ts_offset);
-		data.ts = gpu_ts;
+
+		if (sample_flags & SAMPLE_TS)
+			data.gpu_ts = gpu_ts;
+		if (sample_flags & SAMPLE_CLK_MONO)
+			data.clk_mono = get_clk_mono_from_gpu_ts(stream,
+								gpu_ts);
 	}
 
 	return append_sample(stream, read_state, &data);
@@ -2039,17 +2098,118 @@ static void gen8_oa_enable(struct drm_i915_private *dev_priv)
 				   GEN8_OA_COUNTER_ENABLE);
 }
 
+static void i915_perf_get_clock(struct drm_i915_private *dev_priv,
+			u64 *clk_mono, u64 *gpu_time, u64 *gpu_ts)
+{
+	u64 remainder, ts_interval = NSEC_PER_SEC;
+	u32 gpu_freq = dev_priv->perf.timestamp_frequency;
+	unsigned long flags;
+
+	local_irq_save(flags); /* presumably keeps the two clock reads close together */
+	*clk_mono = ktime_get_mono_fast_ns();
+	*gpu_ts = get_current_gpu_ts(dev_priv); /* raw gpu timestamp ticks */
+	local_irq_restore(flags);
+
+	remainder = do_div(ts_interval, gpu_freq); /* ts_interval: whole ns per tick */
+	remainder *= *gpu_ts;
+	do_div(remainder, gpu_freq); /* ns from the fractional ns-per-tick part */
+
+	*gpu_time = ((*gpu_ts) * ts_interval) + remainder; /* gpu ticks scaled to ns */
+}
+
+/* Delayed work that re-samples CLOCK_MONOTONIC and the GPU timestamp,
+ * corrects perf.timestamp_frequency from the drift seen since the last
+ * sample, and refreshes the perf.clk_offset between the two time bases.
+ */
+static void i915_perf_clock_sync_work(struct work_struct *work)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(work, typeof(*dev_priv), perf.clk_sync_work.work);
+	u64 last_clk_mono = dev_priv->perf.clk_mono;
+	u64 last_gpu_time = dev_priv->perf.gpu_time;
+	u64 clk_mono, clk_mono_offset, gpu_time, gpu_time_offset, gpu_ts;
+	u64 gpu_freq = dev_priv->perf.timestamp_frequency;
+	u64 remainder, ts_interval = NSEC_PER_SEC;
+	s64 delta, freq_delta;
+
+	i915_perf_get_clock(dev_priv, &clk_mono, &gpu_time, &gpu_ts);
+
+	clk_mono_offset = clk_mono - last_clk_mono;
+	gpu_time_offset = gpu_time - last_gpu_time;
+
+	/* delta time in ns (gpu elapsed minus monotonic elapsed) */
+	delta = gpu_time_offset - clk_mono_offset;
+
+	/* If time delta < 1 us, we can assume gpu frequency is correct */
+	if (abs(delta) < NSEC_PER_USEC)
+		goto out;
+
+	/* The two clocks shouldn't deviate more than 1 second during the
+	 * resync period. If this is the case (which may happen due to
+	 * suspend/resume), then don't apply frequency correction, and
+	 * fast forward/rewind the clocks to resync immediately
+	 */
+	if (abs(delta) > NSEC_PER_SEC)
+		goto out;
+
+	/* 64-bit divide: do_div() would truncate the u64 divisor to 32 bits */
+	freq_delta = div64_u64(abs(delta) * gpu_freq, clk_mono_offset);
+
+	if (delta < 0)
+		freq_delta = -freq_delta;
+
+	dev_priv->perf.timestamp_frequency += freq_delta;
+
+	/* Recompute gpu_time with the corrected frequency. This may make
+	 * gpu time jump forwards or backwards, so the monotonicity of
+	 * sample timestamps has to be maintained despite these jumps.
+	 */
+	gpu_freq = dev_priv->perf.timestamp_frequency;
+	remainder = do_div(ts_interval, gpu_freq);
+
+	remainder *= gpu_ts;
+	do_div(remainder, gpu_freq);
+	gpu_time = (ts_interval*gpu_ts) + remainder;
+
+out:
+	dev_priv->perf.clk_mono = clk_mono;
+	dev_priv->perf.gpu_time = gpu_time;
+	dev_priv->perf.clk_offset = dev_priv->perf.gpu_time -
+					dev_priv->perf.clk_mono;
+
+	/* Double the resync period each time, so that the accuracy of our
+	 * calculated frequency can improve over longer baselines.
+	 * NOTE(review): once the doubled period reaches MAX_CLK_SYNC_PERIOD
+	 * nothing is rescheduled and the period is not reset here - confirm.
+	 */
+	dev_priv->perf.resync_period *= 2;
+	if (dev_priv->perf.resync_period < MAX_CLK_SYNC_PERIOD)
+		schedule_delayed_work(&dev_priv->perf.clk_sync_work,
+			msecs_to_jiffies(dev_priv->perf.resync_period));
+}
+
 static void i915_ring_stream_enable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
 	if (stream->sample_flags & SAMPLE_OA_REPORT) {
-		dev_priv->perf.oa.last_gpu_ts =
-			((u64)I915_READ(GT_TIMESTAMP_COUNT_UDW) << 32) |
-			I915_READ(GT_TIMESTAMP_COUNT);
+		dev_priv->perf.oa.last_gpu_ts = get_current_gpu_ts(dev_priv);
 		dev_priv->perf.oa.ops.oa_enable(dev_priv);
 	}
 
+	if (stream->sample_flags & SAMPLE_CLK_MONO) {
+		u64 gpu_ts;
+
+		i915_perf_get_clock(dev_priv, &dev_priv->perf.clk_mono,
+					&dev_priv->perf.gpu_time, &gpu_ts);
+		dev_priv->perf.clk_offset = dev_priv->perf.gpu_time -
+						dev_priv->perf.clk_mono;
+
+		if (dev_priv->perf.resync_period < MAX_CLK_SYNC_PERIOD)
+			schedule_delayed_work(&dev_priv->perf.clk_sync_work,
+				msecs_to_jiffies(dev_priv->perf.resync_period));
+	}
+
 	if (stream->cs_mode)
 		stream->command_stream_hook = i915_ring_stream_cs_hook;
 
@@ -2073,6 +2233,8 @@ static void i915_ring_stream_disable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
+	cancel_delayed_work_sync(&dev_priv->perf.clk_sync_work);
+
 	if (stream->cs_mode || dev_priv->perf.oa.periodic)
 		hrtimer_cancel(&dev_priv->perf.poll_check_timer);
 
@@ -2089,7 +2251,7 @@ static void i915_ring_stream_disable(struct i915_perf_stream *stream)
 static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
 {
 	return 1000000000ULL * (2ULL << exponent) /
-		dev_priv->perf.oa.timestamp_frequency;
+		dev_priv->perf.timestamp_frequency;
 }
 
 static int i915_ring_stream_init(struct i915_perf_stream *stream,
@@ -2102,7 +2264,8 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream,
 	bool require_cs_mode = props->sample_flags & (SAMPLE_PID |
 						      SAMPLE_TAG);
 	bool cs_sample_data = props->sample_flags & (SAMPLE_OA_REPORT |
-							SAMPLE_TS);
+							SAMPLE_TS |
+							SAMPLE_CLK_MONO);
 	int ret;
 
 	if ((props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) {
@@ -2249,6 +2412,19 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream,
 			require_cs_mode = true;
 	}
 
+	if (props->sample_flags & SAMPLE_CLK_MONO) {
+		stream->sample_flags |= SAMPLE_CLK_MONO;
+		stream->sample_size += I915_PERF_TS_SAMPLE_SIZE;
+
+		/*
+		 * NB: it's meaningful to request SAMPLE_CLK_MONO with just CS
+		 * mode or periodic OA mode sampling but we don't allow
+		 * SAMPLE_CLK_MONO without either mode
+		 */
+		if (!require_oa_unit)
+			require_cs_mode = true;
+	}
+
 	if (require_cs_mode && !props->cs_mode) {
 		DRM_ERROR(
 			"PID, TAG or TS sampling require a ring to be specified");
@@ -2273,11 +2449,13 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream,
 
 		/*
 		 * The only time we should allow enabling CS mode if it's not
-		 * strictly required, is if SAMPLE_CTX_ID  or SAMPLE_TS has been
-		 * requested, as they're usable with periodic OA or CS sampling.
+		 * strictly required, is if SAMPLE_CTX_ID, SAMPLE_TS, or
+		 * SAMPLE_CLK_MONO has been requested, as they're usable with
+		 * periodic OA or CS sampling.
 		 */
 		if (!require_cs_mode &&
-		    !(props->sample_flags & (SAMPLE_CTX_ID|SAMPLE_TS))) {
+		    !(props->sample_flags &
+				(SAMPLE_CTX_ID|SAMPLE_TS|SAMPLE_CLK_MONO))) {
 			DRM_ERROR(
 				"Ring given without requesting any CS specific property");
 			ret = -EINVAL;
@@ -2955,6 +3133,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		case DRM_I915_PERF_PROP_SAMPLE_TS:
 			props->sample_flags |= SAMPLE_TS;
 			break;
+		case DRM_I915_PERF_PROP_SAMPLE_CLOCK_MONOTONIC:
+			props->sample_flags |= SAMPLE_CLK_MONO;
+			break;
 		case DRM_I915_PERF_PROP_MAX:
 			BUG();
 		}
@@ -3072,8 +3253,11 @@ void i915_perf_init(struct drm_device *dev)
 	mutex_init(&dev_priv->perf.streams_lock);
 	spin_lock_init(&dev_priv->perf.hook_lock);
 
-	dev_priv->perf.oa.timestamp_frequency =
+	dev_priv->perf.timestamp_frequency =
 				GT_CS_TIMESTAMP_FREQUENCY(dev_priv);
+	dev_priv->perf.resync_period = INIT_CLK_SYNC_PERIOD;
+	INIT_DELAYED_WORK(&dev_priv->perf.clk_sync_work,
+			i915_perf_clock_sync_work);
 
 	if (IS_HASWELL(dev)) {
 		dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 2584c0b..4f1b987 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -3288,6 +3288,16 @@ enum skl_disp_power_wells {
 				INTERVAL_1_33_US(us)) : \
 				INTERVAL_1_28_US(us))
 
+/* GT CS timestamp frequency, in Hz (Broxton 19.2M, other gen9 12M, else 12.5M) */
+#define FREQUENCY_12_5_MHZ	(12500000)
+#define FREQUENCY_12_0_MHZ	(12000000)
+#define FREQUENCY_19_2_MHZ	(19200000)
+#define GT_CS_TIMESTAMP_FREQUENCY(dev_priv) (IS_GEN9(dev_priv) ? \
+				(IS_BROXTON(dev_priv) ? \
+				FREQUENCY_19_2_MHZ : \
+				FREQUENCY_12_0_MHZ) : \
+				FREQUENCY_12_5_MHZ)
+
 /*
  * Logical Context regs
  */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 072ff08..a564a05 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1278,6 +1278,12 @@ enum drm_i915_perf_property_id {
 	 */
 	DRM_I915_PERF_PROP_SAMPLE_TS,
 
+	/**
+	 * This property requests inclusion of CLOCK_MONOTONIC system time in
+	 * the perf sample data.
+	 */
+	DRM_I915_PERF_PROP_SAMPLE_CLOCK_MONOTONIC,
+
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
 
@@ -1346,7 +1352,8 @@ enum drm_i915_perf_record_type {
 	 *     { u32 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID
 	 *     { u32 pid; } && DRM_I915_PERF_PROP_SAMPLE_PID
 	 *     { u32 tag; } && DRM_I915_PERF_PROP_SAMPLE_TAG
-	 *     { u64 timestamp; } && DRM_I915_PERF_PROP_SAMPLE_TS
+	 *     { u64 gpu_ts; } && DRM_I915_PERF_PROP_SAMPLE_TS
+	 *     { u64 clk_mono; } && DRM_I915_PERF_PROP_SAMPLE_CLOCK_MONOTONIC
 	 *     { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
 	 * };
 	 */
-- 
1.9.1



More information about the Intel-gfx mailing list