[Intel-gfx] [PATCH 04/15] drm/i915: flush periodic samples, in case of no pending CS sample requests

sourab.gupta at intel.com sourab.gupta at intel.com
Fri Nov 4 09:30:33 UTC 2016


From: Sourab Gupta <sourab.gupta at intel.com>

When there are no pending CS OA samples, flush the periodic OA samples
collected so far.

We can safely forward the periodic OA samples in the case we
have no pending CS samples, but we can't do so in the case we have
pending CS samples, since we don't know what the ordering between
pending CS samples and periodic samples will eventually be. If we
have no pending CS sample, it won't be possible for future pending CS
sample to have timestamps earlier than current periodic timestamp.

Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h  |   7 +-
 drivers/gpu/drm/i915/i915_perf.c | 163 +++++++++++++++++++++++++++++----------
 2 files changed, 129 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0561315..dedb7f8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1856,8 +1856,9 @@ struct i915_oa_ops {
 		    char __user *buf,
 		    size_t count,
 		    size_t *offset,
-		    u32 ts);
-	bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
+		    u32 ts, u32 max_records);
+	int (*oa_buffer_num_samples)(struct drm_i915_private *dev_priv,
+					u32 *last_ts);
 };
 
 /*
@@ -2221,6 +2222,8 @@ struct drm_i915_private {
 			u32 gen7_latched_oastatus1;
 			u32 ctx_oactxctrl_off;
 			u32 ctx_flexeu0_off;
+			u32 n_pending_periodic_samples;
+			u32 pending_periodic_ts;
 
 			struct i915_oa_ops ops;
 			const struct i915_oa_format *oa_formats;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 7bbc757..2ee4711 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -540,13 +540,30 @@ static void i915_oa_rcs_free_requests(struct drm_i915_private *dev_priv)
  * pointers.  A race here could result in a false positive !empty status which
  * is acceptable.
  */
-static bool gen8_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv)
+static int
+gen8_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv,
+					u32 *last_ts)
 {
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
-	u32 head = I915_READ(GEN8_OAHEADPTR);
-	u32 tail = I915_READ(GEN8_OATAILPTR);
+	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.addr;
+	u32 head = I915_READ(GEN8_OAHEADPTR) & GEN8_OAHEADPTR_MASK;
+	u32 tail = I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
+	u32 mask = (OA_BUFFER_SIZE - 1);
+	u32 num_samples;
+	u8 *report;
+
+	head -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+	tail -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+	num_samples = OA_TAKEN(tail, head) / report_size;
 
-	return OA_TAKEN(tail, head) < report_size;
+	/* read the timestamp of the last sample */
+	if (num_samples) {
+		head += report_size*(num_samples - 1);
+		report = oa_buf_base + (head & mask);
+		*last_ts = *(u32 *)(report + 4);
+	}
+
+	return num_samples;
 }
 
 /* NB: This is either called via fops or the poll check hrtimer (atomic ctx)
@@ -560,16 +577,32 @@ static bool gen8_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_pr
  * pointers.  A race here could result in a false positive !empty status which
  * is acceptable.
  */
-static bool gen7_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv)
+static int
+gen7_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv,
+					u32 *last_ts)
 {
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 	u32 oastatus2 = I915_READ(GEN7_OASTATUS2);
 	u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
 	u32 head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
 	u32 tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
+	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.addr;
+	u32 mask = (OA_BUFFER_SIZE - 1);
+	int available_size;
+	u32 num_samples = 0;
+	u8 *report;
 
-	return OA_TAKEN(tail, head) <
-		dev_priv->perf.oa.tail_margin + report_size;
+	head -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+	tail -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+	available_size = OA_TAKEN(tail, head) - dev_priv->perf.oa.tail_margin;
+	if (available_size >= report_size) {
+		num_samples = available_size / report_size;
+		head += report_size*(num_samples - 1);
+		report = oa_buf_base + (head & mask);
+		*last_ts = *(u32 *)(report + 4);
+	}
+
+	return num_samples;
 }
 
 /**
@@ -698,7 +731,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 				  size_t count,
 				  size_t *offset,
 				  u32 *head_ptr,
-				  u32 tail, u32 ts)
+				  u32 tail, u32 ts, u32 max_records)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -707,6 +740,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	u32 head;
 	u32 taken;
 	int ret = 0;
+	int n_records = 0;
 
 	BUG_ON(stream->state != I915_PERF_STREAM_ENABLED);
 
@@ -733,7 +767,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	tail &= ~(report_size - 1);
 
 	for (/* none */;
-	     (taken = OA_TAKEN(tail, head));
+	     (taken = OA_TAKEN(tail, head)) && (n_records <= max_records);
 	     head = (head + report_size) & mask) {
 		u8 *report = oa_buf_base + head;
 		u32 *report32 = (void *)report;
@@ -815,6 +849,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 			if (ret)
 				break;
 
+			n_records++;
 			dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
 		}
 
@@ -843,7 +878,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 static int gen8_oa_read(struct i915_perf_stream *stream,
 			char __user *buf,
 			size_t count,
-			size_t *offset, u32 ts)
+			size_t *offset, u32 ts, u32 max_records)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -900,7 +935,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 		}
 	}
 
-	ret = gen8_append_oa_reports(stream, buf, count, offset, &head, tail, ts);
+	ret = gen8_append_oa_reports(stream, buf, count, offset, &head, tail,
+					ts, max_records);
 
 	/* All the report sizes are a power of two and the
 	 * head should always be incremented by some multiple
@@ -942,7 +978,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 				  size_t count,
 				  size_t *offset,
 				  u32 *head_ptr,
-				  u32 tail, u32 ts)
+				  u32 tail, u32 ts, u32 max_records)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -952,6 +988,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	u32 head;
 	u32 taken;
 	int ret = 0;
+	int n_records = 0;
 
 	BUG_ON(stream->state != I915_PERF_STREAM_ENABLED);
 
@@ -991,7 +1028,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	tail &= mask;
 
 	for (/* none */;
-	     (taken = OA_TAKEN(tail, head));
+	     (taken = OA_TAKEN(tail, head)) && (n_records <= max_records);
 	     head = (head + report_size) & mask) {
 		u8 *report = oa_buf_base + head;
 		u32 *report32 = (void *)report;
@@ -1025,6 +1062,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 		if (ret)
 			break;
 
+		n_records++;
 		/* The above report-id field sanity check is based on
 		 * the assumption that the OA buffer is initially
 		 * zeroed and we reset the field after copying so the
@@ -1050,7 +1088,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 static int gen7_oa_read(struct i915_perf_stream *stream,
 			char __user *buf,
 			size_t count,
-			size_t *offset, u32 ts)
+			size_t *offset, u32 ts, u32 max_records)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -1123,7 +1161,7 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
 	}
 
 	ret = gen7_append_oa_reports(stream, buf, count, offset,
-				     &head, tail, ts);
+				     &head, tail, ts, max_records);
 
 	/* All the report sizes are a power of two and the
 	 * head should always be incremented by some multiple
@@ -1170,7 +1208,8 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream,
 
 	/* First, append the periodic OA samples having lower timestamps */
 	report_ts = *(u32 *)(report + 4);
-	ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset, report_ts);
+	ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset,
+					report_ts, U32_MAX);
 	if (ret)
 		return ret;
 
@@ -1187,7 +1226,8 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream,
 }
 
 /**
- * Copies all command stream based OA reports into userspace read() buffer.
+ * Copies all OA reports into userspace read() buffer. This includes command
+ * stream as well as periodic OA reports.
  *
  * NB: some data may be successfully copied to the userspace buffer
  * even if an error is returned, and this is reflected in the
@@ -1204,7 +1244,7 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 	spin_lock(&dev_priv->perf.node_list_lock);
 	if (list_empty(&dev_priv->perf.node_list)) {
 		spin_unlock(&dev_priv->perf.node_list_lock);
-		return 0;
+		goto pending_periodic;
 	}
 	list_for_each_entry_safe(entry, next,
 				 &dev_priv->perf.node_list, link) {
@@ -1215,7 +1255,7 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 	spin_unlock(&dev_priv->perf.node_list_lock);
 
 	if (list_empty(&free_list))
-		return 0;
+		goto pending_periodic;
 
 	list_for_each_entry_safe(entry, next, &free_list, link) {
 		ret = append_oa_rcs_sample(stream, buf, count, offset, entry);
@@ -1233,16 +1273,35 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 	spin_unlock(&dev_priv->perf.node_list_lock);
 
 	return ret;
+
+pending_periodic:
+	if (!dev_priv->perf.oa.n_pending_periodic_samples)
+		return 0;
+
+	ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset,
+				dev_priv->perf.oa.pending_periodic_ts,
+				dev_priv->perf.oa.n_pending_periodic_samples);
+	dev_priv->perf.oa.n_pending_periodic_samples = 0;
+	dev_priv->perf.oa.pending_periodic_ts = 0;
+	return ret;
 }
 
+enum cs_buf_data_state {
+	CS_BUF_EMPTY,
+	CS_BUF_REQ_PENDING,
+	CS_BUF_HAVE_DATA,
+};
+
 /*
  * Checks whether the command stream buffer associated with the stream has
  * data ready to be forwarded to userspace.
- * Returns true if atleast one request associated with command stream is
- * completed, else returns false.
+ * Value returned:
+ * CS_BUF_HAVE_DATA	- if there is atleast one completed request
+ * CS_BUF_REQ_PENDING	- there are requests pending, but no completed requests
+ * CS_BUF_EMPTY		- no requests scheduled
  */
-static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
-
+static enum cs_buf_data_state command_stream_buf_state(
+				struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	struct i915_perf_cs_data_node *entry = NULL;
@@ -1256,34 +1315,60 @@ static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
 	spin_unlock(&dev_priv->perf.node_list_lock);
 
 	if (!entry)
-		return true;
+		return CS_BUF_EMPTY;
 	else if (!i915_gem_request_completed(request))
-		return true;
+		return CS_BUF_REQ_PENDING;
 	else
-		return false;
+		return CS_BUF_HAVE_DATA;
 }
 
 /*
- * Checks whether the stream has data ready to forward to userspace.
- * For command stream based streams, check if the command stream buffer has
- * atleast one sample ready, if not return false, irrespective of periodic
- * oa buffer having the data or not.
+ * Checks whether the stream has data ready to forward to userspace, by
+ * querying for periodic oa buffer and command stream buffer samples.
  */
 
 static bool stream_have_data__unlocked(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	enum cs_buf_data_state cs_buf_state;
+	u32 num_samples, last_ts = 0;
 
-	/* Note: the oa_buffer_is_empty() condition is ok to run unlocked as it
-	 * just performs mmio reads of the OA buffer head + tail pointers and
+	/* Note: oa_buffer_num_samples() is ok to run unlocked as it just
+	 * performs mmio reads of the OA buffer head + tail pointers and
 	 * it's assumed we're handling some operation that implies the stream
 	 * can't be destroyed until completion (such as a read()) that ensures
 	 * the device + OA buffer can't disappear
 	 */
+	dev_priv->perf.oa.n_pending_periodic_samples = 0;
+	dev_priv->perf.oa.pending_periodic_ts = 0;
+	num_samples = dev_priv->perf.oa.ops.oa_buffer_num_samples(dev_priv,
+								&last_ts);
 	if (stream->cs_mode)
-		return !command_stream_buf_is_empty(stream);
+		cs_buf_state = command_stream_buf_state(stream);
 	else
-		return !dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv);
+		cs_buf_state = CS_BUF_EMPTY;
+
+	/*
+	 * Note: We can safely forward the periodic OA samples in the case we
+	 * have no pending CS samples, but we can't do so in the case we have
+	 * pending CS samples, since we don't know what the ordering between
+	 * pending CS samples and periodic samples will eventually be. If we
+	 * have no pending CS sample, it won't be possible for future pending CS
+	 * sample to have timestamps earlier than current periodic timestamp.
+	 */
+	switch (cs_buf_state) {
+	case CS_BUF_EMPTY:
+		dev_priv->perf.oa.n_pending_periodic_samples = num_samples;
+		dev_priv->perf.oa.pending_periodic_ts = last_ts;
+		return (num_samples != 0);
+
+	case CS_BUF_HAVE_DATA:
+		return true;
+
+	case CS_BUF_REQ_PENDING:
+		default:
+		return false;
+	}
 }
 
 static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
@@ -1325,7 +1410,7 @@ static int i915_oa_read(struct i915_perf_stream *stream,
 		return oa_rcs_append_reports(stream, buf, count, offset);
 	else
 		return dev_priv->perf.oa.ops.read(stream, buf, count, offset,
-						U32_MAX);
+						U32_MAX, U32_MAX);
 }
 
 static void
@@ -2960,8 +3045,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
 		dev_priv->perf.oa.ops.update_hw_ctx_id_locked =
 			gen7_update_hw_ctx_id_locked;
 		dev_priv->perf.oa.ops.read = gen7_oa_read;
-		dev_priv->perf.oa.ops.oa_buffer_is_empty =
-			gen7_oa_buffer_is_empty_fop_unlocked;
+		dev_priv->perf.oa.ops.oa_buffer_num_samples =
+			gen7_oa_buffer_num_samples_fop_unlocked;
 
 		dev_priv->perf.oa.timestamp_frequency = 12500000;
 
@@ -2974,8 +3059,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
 		dev_priv->perf.oa.ops.oa_enable = gen8_oa_enable;
 		dev_priv->perf.oa.ops.oa_disable = gen8_oa_disable;
 		dev_priv->perf.oa.ops.read = gen8_oa_read;
-		dev_priv->perf.oa.ops.oa_buffer_is_empty =
-			gen8_oa_buffer_is_empty_fop_unlocked;
+		dev_priv->perf.oa.ops.oa_buffer_num_samples =
+				gen8_oa_buffer_num_samples_fop_unlocked;
 
 		dev_priv->perf.oa.oa_formats = gen8_plus_oa_formats;
 
-- 
1.9.1



More information about the Intel-gfx mailing list