[Intel-gfx] [PATCH 4/8] drm/i915: flush periodic samples, in case of no pending CS sample requests

sourab.gupta at intel.com sourab.gupta at intel.com
Thu Mar 16 06:14:11 UTC 2017


From: Sourab Gupta <sourab.gupta at intel.com>

When there are no pending CS OA samples, flush the periodic OA samples
collected so far.

We can safely forward the periodic OA samples in the case we
have no pending CS samples, but we can't do so in the case we have
pending CS samples, since we don't know what the ordering between
pending CS samples and periodic samples will eventually be. If we
have no pending CS sample, it won't be possible for future pending CS
sample to have timestamps earlier than current periodic timestamp.

Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h  |  10 ++-
 drivers/gpu/drm/i915/i915_perf.c | 175 +++++++++++++++++++++++++++++----------
 2 files changed, 139 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0b70052..d0f43e9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2110,10 +2110,11 @@ struct i915_oa_ops {
 		    char __user *buf,
 		    size_t count,
 		    size_t *offset,
-		    u32 ts);
+		    u32 ts,
+		    u32 max_reports);
 
 	/**
-	 * @oa_buffer_check: Check for OA buffer data + update tail
+	 * @oa_buffer_num_reports: Return number of OA reports + update tail
 	 *
 	 * This is either called via fops or the poll check hrtimer (atomic
 	 * ctx) without any locks taken.
@@ -2126,7 +2127,8 @@ struct i915_oa_ops {
 	 * here, which will be handled gracefully - likely resulting in an
 	 * %EAGAIN error for userspace.
 	 */
-	bool (*oa_buffer_check)(struct drm_i915_private *dev_priv);
+	u32 (*oa_buffer_num_reports)(struct drm_i915_private *dev_priv,
+					u32 *last_ts);
 };
 
 /*
@@ -2589,6 +2591,8 @@ struct drm_i915_private {
 			u32 gen7_latched_oastatus1;
 			u32 ctx_oactxctrl_off;
 			u32 ctx_flexeu0_off;
+			u32 n_pending_periodic_samples;
+			u32 pending_periodic_ts;
 
 			/* The RPT_ID/reason field for Gen8+ includes a bit
 			 * to determine if the CTX ID in the report is valid
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 3321dad..d1d9853 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -583,7 +583,7 @@ static void i915_oa_rcs_release_samples(struct drm_i915_private *dev_priv)
 }
 
 /**
- * gen8_oa_buffer_check_unlocked - check for data and update tail ptr state
+ * gen8_oa_buffer_num_reports_unlocked - check for data and update tail ptr state
  * @dev_priv: i915 device instance
  *
  * This is either called via fops (for blocking reads in user ctx) or the poll
@@ -596,7 +596,7 @@ static void i915_oa_rcs_release_samples(struct drm_i915_private *dev_priv)
  * the pointers time to 'age' before they are made available for reading.
  * (See description of OA_TAIL_MARGIN_NSEC above for further details.)
  *
- * Besides returning true when there is data available to read() this function
+ * Besides returning num of reports when there is data available to read() it
  * also has the side effect of updating the oa_buffer.tails[], .aging_timestamp
  * and .aged_tail_idx state used for reading.
  *
@@ -604,14 +604,15 @@ static void i915_oa_rcs_release_samples(struct drm_i915_private *dev_priv)
  * only called while the stream is enabled, while the global OA configuration
  * can't be modified.
  *
- * Returns: %true if the OA buffer contains data, else %false
+ * Returns: number of samples available to read
  */
-static bool gen8_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
+static u32 gen8_oa_buffer_num_reports_unlocked(
+			struct drm_i915_private *dev_priv, u32 *last_ts)
 {
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 	unsigned long flags;
 	unsigned int aged_idx;
-	u32 head, hw_tail, aged_tail, aging_tail;
+	u32 head, hw_tail, aged_tail, aging_tail, num_reports = 0;
 	u64 now;
 
 	/* We have to consider the (unlikely) possibility that read() errors
@@ -652,6 +653,13 @@ static bool gen8_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
 	if (aging_tail != INVALID_TAIL_PTR &&
 	    ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
 	     OA_TAIL_MARGIN_NSEC)) {
+		u32 mask = (OA_BUFFER_SIZE - 1);
+		u32 gtt_offset = i915_ggtt_offset(
+				dev_priv->perf.oa.oa_buffer.vma);
+		u32 head = (dev_priv->perf.oa.oa_buffer.head - gtt_offset)
+				& mask;
+		u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
+		u32 *report32;
 
 		aged_idx ^= 1;
 		dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;
@@ -661,6 +669,14 @@ static bool gen8_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
 		/* Mark that we need a new pointer to start aging... */
 		dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset = INVALID_TAIL_PTR;
 		aging_tail = INVALID_TAIL_PTR;
+
+		num_reports = OA_TAKEN(((aged_tail - gtt_offset) & mask), head)/
+				report_size;
+
+		/* read the timestamp of last OA report */
+		head = (head + report_size*(num_reports - 1)) & mask;
+		report32 = (u32 *)(oa_buf_base + head);
+		*last_ts = report32[1];
 	}
 
 	/* Update the aging tail
@@ -694,12 +710,11 @@ static bool gen8_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
 
 	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 
-	return aged_tail == INVALID_TAIL_PTR ?
-		false : OA_TAKEN(aged_tail, head) >= report_size;
+	return aged_tail == INVALID_TAIL_PTR ? 0 : num_reports;
 }
 
 /**
- * gen7_oa_buffer_check_unlocked - check for data and update tail ptr state
+ * gen7_oa_buffer_num_reports_unlocked - check for data and update tail ptr state
  * @dev_priv: i915 device instance
  *
  * This is either called via fops (for blocking reads in user ctx) or the poll
@@ -712,7 +727,7 @@ static bool gen8_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
  * the pointers time to 'age' before they are made available for reading.
  * (See description of OA_TAIL_MARGIN_NSEC above for further details.)
  *
- * Besides returning true when there is data available to read() this function
+ * Besides returning num of reports when there is data available to read() it
  * also has the side effect of updating the oa_buffer.tails[], .aging_timestamp
  * and .aged_tail_idx state used for reading.
  *
@@ -720,15 +735,16 @@ static bool gen8_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
  * only called while the stream is enabled, while the global OA configuration
  * can't be modified.
  *
- * Returns: %true if the OA buffer contains data, else %false
+ * Returns: number of samples available to read
  */
-static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
+static u32 gen7_oa_buffer_num_reports_unlocked(
+			struct drm_i915_private *dev_priv, u32 *last_ts)
 {
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 	unsigned long flags;
 	unsigned int aged_idx;
 	u32 oastatus1;
-	u32 head, hw_tail, aged_tail, aging_tail;
+	u32 head, hw_tail, aged_tail, aging_tail, num_reports = 0;
 	u64 now;
 
 	/* We have to consider the (unlikely) possibility that read() errors
@@ -770,6 +786,14 @@ static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
 	if (aging_tail != INVALID_TAIL_PTR &&
 	    ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
 	     OA_TAIL_MARGIN_NSEC)) {
+		u32 mask = (OA_BUFFER_SIZE - 1);
+		u32 gtt_offset = i915_ggtt_offset(
+				dev_priv->perf.oa.oa_buffer.vma);
+		u32 head = (dev_priv->perf.oa.oa_buffer.head - gtt_offset)
+				& mask;
+		u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
+		u32 *report32;
+
 		aged_idx ^= 1;
 		dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;
 
@@ -778,6 +802,14 @@ static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
 		/* Mark that we need a new pointer to start aging... */
 		dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset = INVALID_TAIL_PTR;
 		aging_tail = INVALID_TAIL_PTR;
+
+		num_reports = OA_TAKEN(((aged_tail - gtt_offset) & mask), head)/
+				report_size;
+
+		/* read the timestamp of last OA report */
+		head = (head + report_size*(num_reports - 1)) & mask;
+		report32 = (u32 *)(oa_buf_base + head);
+		*last_ts = report32[1];
 	}
 
 	/* Update the aging tail
@@ -811,8 +843,7 @@ static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
 
 	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 
-	return aged_tail == INVALID_TAIL_PTR ?
-		false : OA_TAKEN(aged_tail, head) >= report_size;
+	return aged_tail == INVALID_TAIL_PTR ? 0 : num_reports;
 }
 
 /**
@@ -955,6 +986,7 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream,
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
  * @ts: copy OA reports till this timestamp
+ * @max_reports: max number of OA reports to copy
  *
  * Notably any error condition resulting in a short read (-%ENOSPC or
  * -%EFAULT) will be returned even though one or more records may
@@ -973,7 +1005,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 				  char __user *buf,
 				  size_t count,
 				  size_t *offset,
-				  u32 ts)
+				  u32 ts,
+				  u32 max_reports)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -986,6 +1019,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	u32 head, tail;
 	u32 taken;
 	int ret = 0;
+	u32 report_count = 0;
 
 	if (WARN_ON(stream->state != I915_PERF_STREAM_ENABLED))
 		return -EIO;
@@ -1024,7 +1058,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 
 
 	for (/* none */;
-	     (taken = OA_TAKEN(tail, head));
+	     (taken = OA_TAKEN(tail, head)) && (report_count <= max_reports);
 	     head = (head + report_size) & mask) {
 		u8 *report = oa_buf_base + head;
 		u32 *report32 = (void *)report;
@@ -1127,6 +1161,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 			if (ret)
 				break;
 
+			report_count++;
 			dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
 		}
 
@@ -1163,6 +1198,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
  * @ts: copy OA reports till this timestamp
+ * @max_reports: max number of OA reports to copy
  *
  * Checks OA unit status registers and if necessary appends corresponding
  * status records for userspace (such as for a buffer full condition) and then
@@ -1181,7 +1217,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 			char __user *buf,
 			size_t count,
 			size_t *offset,
-			u32 ts)
+			u32 ts,
+			u32 max_reports)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	u32 oastatus;
@@ -1231,7 +1268,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 		}
 	}
 
-	return gen8_append_oa_reports(stream, buf, count, offset, ts);
+	return gen8_append_oa_reports(stream, buf, count, offset, ts,
+					max_reports);
 }
 
 /**
@@ -1241,6 +1279,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
  * @ts: copy OA reports till this timestamp
+ * @max_reports: max number of OA reports to copy
  *
  * Notably any error condition resulting in a short read (-%ENOSPC or
  * -%EFAULT) will be returned even though one or more records may
@@ -1259,7 +1298,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 				  char __user *buf,
 				  size_t count,
 				  size_t *offset,
-				  u32 ts)
+				  u32 ts,
+				  u32 max_reports)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -1272,6 +1312,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	u32 head, tail;
 	u32 taken;
 	int ret = 0;
+	u32 report_count = 0;
 
 	if (WARN_ON(stream->state != I915_PERF_STREAM_ENABLED))
 		return -EIO;
@@ -1310,7 +1351,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 
 
 	for (/* none */;
-	     (taken = OA_TAKEN(tail, head));
+	     (taken = OA_TAKEN(tail, head)) && (report_count <= max_reports);
 	     head = (head + report_size) & mask) {
 		u8 *report = oa_buf_base + head;
 		u32 *report32 = (void *)report;
@@ -1348,6 +1389,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 		if (ret)
 			break;
 
+		report_count++;
 		/* The above report-id field sanity check is based on
 		 * the assumption that the OA buffer is initially
 		 * zeroed and we reset the field after copying so the
@@ -1384,6 +1426,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
  * @ts: copy OA reports till this timestamp
+ * @max_reports: max number of OA reports to copy
  *
  * Checks Gen 7 specific OA unit status registers and if necessary appends
  * corresponding status records for userspace (such as for a buffer full
@@ -1398,7 +1441,8 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
 			char __user *buf,
 			size_t count,
 			size_t *offset,
-			u32 ts)
+			u32 ts,
+			u32 max_reports)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	u32 oastatus1;
@@ -1460,7 +1504,8 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
 			GEN7_OASTATUS1_REPORT_LOST;
 	}
 
-	return gen7_append_oa_reports(stream, buf, count, offset, ts);
+	return gen7_append_oa_reports(stream, buf, count, offset, ts,
+					max_reports);
 }
 
 /**
@@ -1490,7 +1535,8 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream,
 
 	/* First, append the periodic OA samples having lower timestamps */
 	report_ts = *(u32 *)(report + 4);
-	ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset, report_ts);
+	ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset,
+					report_ts, U32_MAX);
 	if (ret)
 		return ret;
 
@@ -1535,7 +1581,7 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 	spin_lock(&dev_priv->perf.sample_lock);
 	if (list_empty(&dev_priv->perf.cs_samples)) {
 		spin_unlock(&dev_priv->perf.sample_lock);
-		return 0;
+		goto pending_periodic;
 	}
 	list_for_each_entry_safe(entry, next,
 				 &dev_priv->perf.cs_samples, link) {
@@ -1546,7 +1592,7 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 	spin_unlock(&dev_priv->perf.sample_lock);
 
 	if (list_empty(&free_list))
-		return 0;
+		goto pending_periodic;
 
 	list_for_each_entry_safe(entry, next, &free_list, link) {
 		ret = append_oa_rcs_sample(stream, buf, count, offset, entry);
@@ -1564,18 +1610,37 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 	spin_unlock(&dev_priv->perf.sample_lock);
 
 	return ret;
+
+pending_periodic:
+	if (!dev_priv->perf.oa.n_pending_periodic_samples)
+		return 0;
+
+	ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset,
+				dev_priv->perf.oa.pending_periodic_ts,
+				dev_priv->perf.oa.n_pending_periodic_samples);
+	dev_priv->perf.oa.n_pending_periodic_samples = 0;
+	dev_priv->perf.oa.pending_periodic_ts = 0;
+	return ret;
 }
 
+enum cs_buf_state {
+	CS_BUF_EMPTY,
+	CS_BUF_REQ_PENDING,
+	CS_BUF_HAVE_DATA,
+};
+
 /*
- * command_stream_buf_is_empty - Checks whether the command stream buffer
+ * command_stream_buf_state - Checks whether the command stream buffer
  * associated with the stream has data available.
  * @stream: An i915-perf stream opened for OA metrics
  *
- * Returns: true if atleast one request associated with command stream is
- * completed, else returns false.
+ * Returns:
+ * CS_BUF_HAVE_DATA	- if there is atleast one completed request
+ * CS_BUF_REQ_PENDING	- there are requests pending, but no completed requests
+ * CS_BUF_EMPTY		- no requests scheduled
  */
-static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
-
+static enum cs_buf_state command_stream_buf_state(
+				struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	struct i915_perf_cs_sample *entry = NULL;
@@ -1589,30 +1654,54 @@ static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
 	spin_unlock(&dev_priv->perf.sample_lock);
 
 	if (!entry)
-		return true;
+		return CS_BUF_EMPTY;
 	else if (!i915_gem_request_completed(request))
-		return true;
+		return CS_BUF_REQ_PENDING;
 	else
-		return false;
+		return CS_BUF_HAVE_DATA;
 }
 
 /**
  * stream_have_data_unlocked - Checks whether the stream has data available
  * @stream: An i915-perf stream opened for OA metrics
  *
- * For command stream based streams, check if the command stream buffer has
- * atleast one sample available, if not return false, irrespective of periodic
- * oa buffer having the data or not.
+ * Note: We can safely forward the periodic OA samples in the case we have no
+ * pending CS samples, but we can't do so in the case we have pending CS
+ * samples, since we don't know what the ordering between pending CS samples
+ * and periodic samples will eventually be. If we have no pending CS sample,
+ * it won't be possible for future pending CS sample to have timestamps
+ * earlier than current periodic timestamp.
  */
 
 static bool stream_have_data_unlocked(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	enum cs_buf_state state;
+	u32 num_samples, last_ts = 0;
 
+	dev_priv->perf.oa.n_pending_periodic_samples = 0;
+	dev_priv->perf.oa.pending_periodic_ts = 0;
+	num_samples = dev_priv->perf.oa.ops.oa_buffer_num_reports(dev_priv,
+								&last_ts);
 	if (stream->cs_mode)
-		return !command_stream_buf_is_empty(stream);
+		state = command_stream_buf_state(stream);
 	else
-		return dev_priv->perf.oa.ops.oa_buffer_check(dev_priv);
+		state = CS_BUF_EMPTY;
+
+	switch (state) {
+	case CS_BUF_EMPTY:
+		dev_priv->perf.oa.n_pending_periodic_samples = num_samples;
+		dev_priv->perf.oa.pending_periodic_ts = last_ts;
+		return (num_samples != 0);
+
+	case CS_BUF_HAVE_DATA:
+		return true;
+
+	case CS_BUF_REQ_PENDING:
+		default:
+		return false;
+	}
+	return false;
 }
 
 /**
@@ -1698,7 +1787,7 @@ static int i915_oa_read(struct i915_perf_stream *stream,
 		return oa_rcs_append_reports(stream, buf, count, offset);
 	else
 		return dev_priv->perf.oa.ops.read(stream, buf, count, offset,
-						U32_MAX);
+						U32_MAX, U32_MAX);
 }
 
 /**
@@ -3688,8 +3777,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
 		dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
 		dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
 		dev_priv->perf.oa.ops.read = gen7_oa_read;
-		dev_priv->perf.oa.ops.oa_buffer_check =
-			gen7_oa_buffer_check_unlocked;
+		dev_priv->perf.oa.ops.oa_buffer_num_reports =
+			gen7_oa_buffer_num_reports_unlocked;
 
 		dev_priv->perf.oa.oa_formats = hsw_oa_formats;
 
@@ -3753,8 +3842,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
 			dev_priv->perf.oa.ops.oa_enable = gen8_oa_enable;
 			dev_priv->perf.oa.ops.oa_disable = gen8_oa_disable;
 			dev_priv->perf.oa.ops.read = gen8_oa_read;
-			dev_priv->perf.oa.ops.oa_buffer_check =
-				gen8_oa_buffer_check_unlocked;
+			dev_priv->perf.oa.ops.oa_buffer_num_reports =
+				gen8_oa_buffer_num_reports_unlocked;
 
 			dev_priv->perf.oa.oa_formats = gen8_plus_oa_formats;
 		}
-- 
1.9.1



More information about the Intel-gfx mailing list