[PATCH] drm/i915/oa: rmb ftw

Chris Wilson chris at chris-wilson.co.uk
Tue Feb 19 01:22:42 UTC 2019


---
 drivers/gpu/drm/i915/i915_drv.h  |  59 ----
 drivers/gpu/drm/i915/i915_perf.c | 493 ++++---------------------------
 2 files changed, 64 insertions(+), 488 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5c8d0489a1cd..773c205e1bbe 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1881,12 +1881,6 @@ struct drm_i915_private {
 			wait_queue_head_t poll_wq;
 			bool pollin;
 
-			/**
-			 * For rate limiting any notifications of spurious
-			 * invalid OA reports
-			 */
-			struct ratelimit_state spurious_report_rs;
-
 			bool periodic;
 			int period_exponent;
 
@@ -1899,59 +1893,6 @@ struct drm_i915_private {
 				int format;
 				int format_size;
 
-				/**
-				 * Locks reads and writes to all head/tail state
-				 *
-				 * Consider: the head and tail pointer state
-				 * needs to be read consistently from a hrtimer
-				 * callback (atomic context) and read() fop
-				 * (user context) with tail pointer updates
-				 * happening in atomic context and head updates
-				 * in user context and the (unlikely)
-				 * possibility of read() errors needing to
-				 * reset all head/tail state.
-				 *
-				 * Note: Contention or performance aren't
-				 * currently a significant concern here
-				 * considering the relatively low frequency of
-				 * hrtimer callbacks (5ms period) and that
-				 * reads typically only happen in response to a
-				 * hrtimer event and likely complete before the
-				 * next callback.
-				 *
-				 * Note: This lock is not held *while* reading
-				 * and copying data to userspace so the value
-				 * of head observed in htrimer callbacks won't
-				 * represent any partial consumption of data.
-				 */
-				spinlock_t ptr_lock;
-
-				/**
-				 * One 'aging' tail pointer and one 'aged'
-				 * tail pointer ready to used for reading.
-				 *
-				 * Initial values of 0xffffffff are invalid
-				 * and imply that an update is required
-				 * (and should be ignored by an attempted
-				 * read)
-				 */
-				struct {
-					u32 offset;
-				} tails[2];
-
-				/**
-				 * Index for the aged tail ready to read()
-				 * data up to.
-				 */
-				unsigned int aged_tail_idx;
-
-				/**
-				 * A monotonic timestamp for when the current
-				 * aging tail pointer was read; used to
-				 * determine when it is old enough to trust.
-				 */
-				u64 aging_timestamp;
-
 				/**
 				 * Although we can always read back the head
 				 * pointer register, we prefer to avoid
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 9ebf99f3d8d3..fe2b9dc83955 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -217,53 +217,8 @@
  * that the overflow cases are unlikely in normal operation.
  */
 #define OA_BUFFER_SIZE		SZ_16M
-
 #define OA_TAKEN(tail, head)	((tail - head) & (OA_BUFFER_SIZE - 1))
 
-/**
- * DOC: OA Tail Pointer Race
- *
- * There's a HW race condition between OA unit tail pointer register updates and
- * writes to memory whereby the tail pointer can sometimes get ahead of what's
- * been written out to the OA buffer so far (in terms of what's visible to the
- * CPU).
- *
- * Although this can be observed explicitly while copying reports to userspace
- * by checking for a zeroed report-id field in tail reports, we want to account
- * for this earlier, as part of the oa_buffer_check to avoid lots of redundant
- * read() attempts.
- *
- * In effect we define a tail pointer for reading that lags the real tail
- * pointer by at least %OA_TAIL_MARGIN_NSEC nanoseconds, which gives enough
- * time for the corresponding reports to become visible to the CPU.
- *
- * To manage this we actually track two tail pointers:
- *  1) An 'aging' tail with an associated timestamp that is tracked until we
- *     can trust the corresponding data is visible to the CPU; at which point
- *     it is considered 'aged'.
- *  2) An 'aged' tail that can be used for read()ing.
- *
- * The two separate pointers let us decouple read()s from tail pointer aging.
- *
- * The tail pointers are checked and updated at a limited rate within a hrtimer
- * callback (the same callback that is used for delivering EPOLLIN events)
- *
- * Initially the tails are marked invalid with %INVALID_TAIL_PTR which
- * indicates that an updated tail pointer is needed.
- *
- * Most of the implementation details for this workaround are in
- * oa_buffer_check_unlocked() and _append_oa_reports()
- *
- * Note for posterity: previously the driver used to define an effective tail
- * pointer that lagged the real pointer by a 'tail margin' measured in bytes
- * derived from %OA_TAIL_MARGIN_NSEC and the configured sampling frequency.
- * This was flawed considering that the OA unit may also automatically generate
- * non-periodic reports (such as on context switch) or the OA unit may be
- * enabled without any periodic sampling.
- */
-#define OA_TAIL_MARGIN_NSEC	100000ULL
-#define INVALID_TAIL_PTR	0xffffffff
-
 /* frequency for checking whether the OA unit has written new reports to the
  * circular OA buffer...
  */
@@ -413,14 +368,12 @@ static int get_oa_config(struct drm_i915_private *dev_priv,
 
 static u32 gen8_oa_hw_tail_read(struct drm_i915_private *dev_priv)
 {
-	return I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
+	return I915_READ_FW(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
 }
 
 static u32 gen7_oa_hw_tail_read(struct drm_i915_private *dev_priv)
 {
-	u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
-
-	return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
+	return I915_READ_FW(GEN7_OASTATUS1) & GEN7_OASTATUS1_TAIL_MASK;
 }
 
 /**
@@ -431,112 +384,17 @@ static u32 gen7_oa_hw_tail_read(struct drm_i915_private *dev_priv)
  * check hrtimer (atomic ctx) to check the OA buffer tail pointer and check
  * if there is data available for userspace to read.
  *
- * This function is central to providing a workaround for the OA unit tail
- * pointer having a race with respect to what data is visible to the CPU.
- * It is responsible for reading tail pointers from the hardware and giving
- * the pointers time to 'age' before they are made available for reading.
- * (See description of OA_TAIL_MARGIN_NSEC above for further details.)
- *
- * Besides returning true when there is data available to read() this function
- * also has the side effect of updating the oa_buffer.tails[], .aging_timestamp
- * and .aged_tail_idx state used for reading.
- *
- * Note: It's safe to read OA config state here unlocked, assuming that this is
- * only called while the stream is enabled, while the global OA configuration
- * can't be modified.
- *
  * Returns: %true if the OA buffer contains data, else %false
  */
 static bool oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
 {
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
-	unsigned long flags;
-	unsigned int aged_idx;
-	u32 head, hw_tail, aged_tail, aging_tail;
-	u64 now;
-
-	/* We have to consider the (unlikely) possibility that read() errors
-	 * could result in an OA buffer reset which might reset the head,
-	 * tails[] and aged_tail state.
-	 */
-	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
-
-	/* NB: The head we observe here might effectively be a little out of
-	 * date (between head and tails[aged_idx].offset if there is currently
-	 * a read() in progress.
-	 */
-	head = dev_priv->perf.oa.oa_buffer.head;
-
-	aged_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
-	aged_tail = dev_priv->perf.oa.oa_buffer.tails[aged_idx].offset;
-	aging_tail = dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset;
-
-	hw_tail = dev_priv->perf.oa.ops.oa_hw_tail_read(dev_priv);
-
-	/* The tail pointer increases in 64 byte increments,
-	 * not in report_size steps...
-	 */
-	hw_tail &= ~(report_size - 1);
-
-	now = ktime_get_mono_fast_ns();
-
-	/* Update the aged tail
-	 *
-	 * Flip the tail pointer available for read()s once the aging tail is
-	 * old enough to trust that the corresponding data will be visible to
-	 * the CPU...
-	 *
-	 * Do this before updating the aging pointer in case we may be able to
-	 * immediately start aging a new pointer too (if new data has become
-	 * available) without needing to wait for a later hrtimer callback.
-	 */
-	if (aging_tail != INVALID_TAIL_PTR &&
-	    ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
-	     OA_TAIL_MARGIN_NSEC)) {
-
-		aged_idx ^= 1;
-		dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;
-
-		aged_tail = aging_tail;
-
-		/* Mark that we need a new pointer to start aging... */
-		dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset = INVALID_TAIL_PTR;
-		aging_tail = INVALID_TAIL_PTR;
-	}
-
-	/* Update the aging tail
-	 *
-	 * We throttle aging tail updates until we have a new tail that
-	 * represents >= one report more data than is already available for
-	 * reading. This ensures there will be enough data for a successful
-	 * read once this new pointer has aged and ensures we will give the new
-	 * pointer time to age.
-	 */
-	if (aging_tail == INVALID_TAIL_PTR &&
-	    (aged_tail == INVALID_TAIL_PTR ||
-	     OA_TAKEN(hw_tail, aged_tail) >= report_size)) {
-		struct i915_vma *vma = dev_priv->perf.oa.oa_buffer.vma;
-		u32 gtt_offset = i915_ggtt_offset(vma);
-
-		/* Be paranoid and do a bounds check on the pointer read back
-		 * from hardware, just in case some spurious hardware condition
-		 * could put the tail out of bounds...
-		 */
-		if (hw_tail >= gtt_offset &&
-		    hw_tail < (gtt_offset + OA_BUFFER_SIZE)) {
-			dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset =
-				aging_tail = hw_tail;
-			dev_priv->perf.oa.oa_buffer.aging_timestamp = now;
-		} else {
-			DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %u\n",
-				  hw_tail);
-		}
-	}
+	u32 head, tail;
 
-	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
+	head = READ_ONCE(dev_priv->perf.oa.oa_buffer.head);
+	tail = dev_priv->perf.oa.ops.oa_hw_tail_read(dev_priv);
 
-	return aged_tail == INVALID_TAIL_PTR ?
-		false : OA_TAKEN(aged_tail, head) >= report_size;
+	return OA_TAKEN(tail, head) >= report_size;
 }
 
 /**
@@ -594,18 +452,15 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 			    char __user *buf,
 			    size_t count,
 			    size_t *offset,
-			    const u8 *report)
+			    const void *report,
+			    int report_size)
 {
-	struct drm_i915_private *dev_priv = stream->dev_priv;
-	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
-	struct drm_i915_perf_record_header header;
-	u32 sample_flags = stream->sample_flags;
-
-	header.type = DRM_I915_PERF_RECORD_SAMPLE;
-	header.pad = 0;
-	header.size = stream->sample_size;
+	struct drm_i915_perf_record_header header = {
+		.type = DRM_I915_PERF_RECORD_SAMPLE,
+		.size = stream->sample_size,
+	};
 
-	if ((count - *offset) < header.size)
+	if (count - *offset < header.size)
 		return -ENOSPC;
 
 	buf += *offset;
@@ -613,13 +468,11 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 		return -EFAULT;
 	buf += sizeof(header);
 
-	if (sample_flags & SAMPLE_OA_REPORT) {
-		if (copy_to_user(buf, report, report_size))
-			return -EFAULT;
-	}
-
-	(*offset) += header.size;
+	if (stream->sample_flags & SAMPLE_OA_REPORT &&
+	    copy_to_user(buf, report, report_size))
+		return -EFAULT;
 
+	*offset += header.size;
 	return 0;
 }
 
@@ -649,77 +502,24 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 				  size_t *offset)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	const void *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
-	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
 	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
-	u32 mask = (OA_BUFFER_SIZE - 1);
-	size_t start_offset = *offset;
-	unsigned long flags;
-	unsigned int aged_tail_idx;
+	u32 report[64] __attribute__((aligned(16)));
 	u32 head, tail;
-	u32 taken;
 	int ret = 0;
 
 	if (WARN_ON(!stream->enabled))
 		return -EIO;
 
-	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
-
-	head = dev_priv->perf.oa.oa_buffer.head;
-	aged_tail_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
-	tail = dev_priv->perf.oa.oa_buffer.tails[aged_tail_idx].offset;
-
-	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
-
-	/*
-	 * An invalid tail pointer here means we're still waiting for the poll
-	 * hrtimer callback to give us a pointer
-	 */
-	if (tail == INVALID_TAIL_PTR)
-		return -EAGAIN;
-
-	/*
-	 * NB: oa_buffer.head/tail include the gtt_offset which we don't want
-	 * while indexing relative to oa_buf_base.
-	 */
-	head -= gtt_offset;
-	tail -= gtt_offset;
-
-	/*
-	 * An out of bounds or misaligned head or tail pointer implies a driver
-	 * bug since we validate + align the tail pointers we read from the
-	 * hardware and we are in full control of the head pointer which should
-	 * only be incremented by multiples of the report size (notably also
-	 * all a power of two).
-	 */
-	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
-		      tail > OA_BUFFER_SIZE || tail % report_size,
-		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
-		      head, tail))
-		return -EIO;
-
-
-	for (/* none */;
-	     (taken = OA_TAKEN(tail, head));
-	     head = (head + report_size) & mask) {
-		u8 *report = oa_buf_base + head;
-		u32 *report32 = (void *)report;
+	for (head = READ_ONCE(dev_priv->perf.oa.oa_buffer.head) - gtt_offset,
+	     tail = gen8_oa_hw_tail_read(dev_priv) - gtt_offset;
+	     OA_TAKEN(tail, head) >= report_size;
+	     head = (head + report_size) & (OA_BUFFER_SIZE - 1)) {
 		u32 ctx_id;
 		u32 reason;
 
-		/*
-		 * All the report sizes factor neatly into the buffer
-		 * size so we never expect to see a report split
-		 * between the beginning and end of the buffer.
-		 *
-		 * Given the initial alignment check a misalignment
-		 * here would imply a driver bug that would result
-		 * in an overrun.
-		 */
-		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
-			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
-			break;
-		}
+		i915_memcpy_from_wc(report, oa_buf_base + head, report_size);
 
 		/*
 		 * The reason field includes flags identifying what
@@ -730,15 +530,12 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * check that the report isn't invalid before copying
 		 * it to userspace...
 		 */
-		reason = ((report32[0] >> OAREPORT_REASON_SHIFT) &
+		reason = ((report[0] >> OAREPORT_REASON_SHIFT) &
 			  OAREPORT_REASON_MASK);
-		if (reason == 0) {
-			if (__ratelimit(&dev_priv->perf.oa.spurious_report_rs))
-				DRM_NOTE("Skipping spurious, invalid OA report\n");
+		if (reason == 0)
 			continue;
-		}
 
-		ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;
+		ctx_id = report[2] & dev_priv->perf.oa.specific_ctx_id_mask;
 
 		/*
 		 * Squash whatever is in the CTX_ID field if it's marked as
@@ -748,8 +545,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * Note: that we don't clear the valid_ctx_bit so userspace can
 		 * understand that the ID has been squashed by the kernel.
 		 */
-		if (!(report32[0] & dev_priv->perf.oa.gen8_valid_ctx_bit))
-			ctx_id = report32[2] = INVALID_CTX_ID;
+		if (!(report[0] & dev_priv->perf.oa.gen8_valid_ctx_bit))
+			ctx_id = report[2] = INVALID_CTX_ID;
 
 		/*
 		 * NB: For Gen 8 the OA unit no longer supports clock gating
@@ -793,42 +590,21 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 			 * leaking the IDs of other contexts.
 			 */
 			if (dev_priv->perf.oa.exclusive_stream->ctx &&
-			    dev_priv->perf.oa.specific_ctx_id != ctx_id) {
-				report32[2] = INVALID_CTX_ID;
-			}
+			    dev_priv->perf.oa.specific_ctx_id != ctx_id)
+				report[2] = INVALID_CTX_ID;
 
 			ret = append_oa_sample(stream, buf, count, offset,
-					       report);
+					       report, report_size);
 			if (ret)
 				break;
 
 			dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
 		}
-
-		/*
-		 * The above reason field sanity check is based on
-		 * the assumption that the OA buffer is initially
-		 * zeroed and we reset the field after copying so the
-		 * check is still meaningful once old reports start
-		 * being overwritten.
-		 */
-		report32[0] = 0;
 	}
 
-	if (start_offset != *offset) {
-		spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
-
-		/*
-		 * We removed the gtt_offset for the copy loop above, indexing
-		 * relative to oa_buf_base so put back here...
-		 */
-		head += gtt_offset;
-
-		I915_WRITE(GEN8_OAHEADPTR, head & GEN8_OAHEADPTR_MASK);
-		dev_priv->perf.oa.oa_buffer.head = head;
-
-		spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
-	}
+	head += gtt_offset;
+	I915_WRITE_FW(GEN8_OAHEADPTR, head & GEN8_OAHEADPTR_MASK);
+	WRITE_ONCE(dev_priv->perf.oa.oa_buffer.head, head);
 
 	return ret;
 }
@@ -938,112 +714,43 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 				  size_t *offset)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	const void *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
-	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
 	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
-	u32 mask = (OA_BUFFER_SIZE - 1);
-	size_t start_offset = *offset;
-	unsigned long flags;
-	unsigned int aged_tail_idx;
+	u32 report[64] __attribute__((aligned(16)));
 	u32 head, tail;
-	u32 taken;
 	int ret = 0;
 
 	if (WARN_ON(!stream->enabled))
 		return -EIO;
 
-	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
+	for (head = READ_ONCE(dev_priv->perf.oa.oa_buffer.head) - gtt_offset,
+	     tail = gen7_oa_hw_tail_read(dev_priv) - gtt_offset;
+	     OA_TAKEN(tail, head) >= report_size;
+	     head = (head + report_size) & (OA_BUFFER_SIZE - 1)) {
+		i915_memcpy_from_wc(report, oa_buf_base + head, report_size);
 
-	head = dev_priv->perf.oa.oa_buffer.head;
-	aged_tail_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
-	tail = dev_priv->perf.oa.oa_buffer.tails[aged_tail_idx].offset;
-
-	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
-
-	/* An invalid tail pointer here means we're still waiting for the poll
-	 * hrtimer callback to give us a pointer
-	 */
-	if (tail == INVALID_TAIL_PTR)
-		return -EAGAIN;
-
-	/* NB: oa_buffer.head/tail include the gtt_offset which we don't want
-	 * while indexing relative to oa_buf_base.
-	 */
-	head -= gtt_offset;
-	tail -= gtt_offset;
-
-	/* An out of bounds or misaligned head or tail pointer implies a driver
-	 * bug since we validate + align the tail pointers we read from the
-	 * hardware and we are in full control of the head pointer which should
-	 * only be incremented by multiples of the report size (notably also
-	 * all a power of two).
-	 */
-	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
-		      tail > OA_BUFFER_SIZE || tail % report_size,
-		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
-		      head, tail))
-		return -EIO;
-
-
-	for (/* none */;
-	     (taken = OA_TAKEN(tail, head));
-	     head = (head + report_size) & mask) {
-		u8 *report = oa_buf_base + head;
-		u32 *report32 = (void *)report;
-
-		/* All the report sizes factor neatly into the buffer
-		 * size so we never expect to see a report split
-		 * between the beginning and end of the buffer.
-		 *
-		 * Given the initial alignment check a misalignment
-		 * here would imply a driver bug that would result
-		 * in an overrun.
-		 */
-		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
-			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
-			break;
-		}
-
-		/* The report-ID field for periodic samples includes
+		/*
+		 * The report-ID field for periodic samples includes
 		 * some undocumented flags related to what triggered
 		 * the report and is never expected to be zero so we
 		 * can check that the report isn't invalid before
 		 * copying it to userspace...
 		 */
-		if (report32[0] == 0) {
-			if (__ratelimit(&dev_priv->perf.oa.spurious_report_rs))
-				DRM_NOTE("Skipping spurious, invalid OA report\n");
+		if (report[0] == 0)
 			continue;
-		}
 
-		ret = append_oa_sample(stream, buf, count, offset, report);
+		ret = append_oa_sample(stream, buf, count, offset,
+				       report, report_size);
 		if (ret)
 			break;
-
-		/* The above report-id field sanity check is based on
-		 * the assumption that the OA buffer is initially
-		 * zeroed and we reset the field after copying so the
-		 * check is still meaningful once old reports start
-		 * being overwritten.
-		 */
-		report32[0] = 0;
 	}
 
-	if (start_offset != *offset) {
-		spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
-
-		/* We removed the gtt_offset for the copy loop above, indexing
-		 * relative to oa_buf_base so put back here...
-		 */
-		head += gtt_offset;
-
-		I915_WRITE(GEN7_OASTATUS2,
-			   ((head & GEN7_OASTATUS2_HEAD_MASK) |
-			    GEN7_OASTATUS2_MEM_SELECT_GGTT));
-		dev_priv->perf.oa.oa_buffer.head = head;
-
-		spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
-	}
+	head += gtt_offset;
+	I915_WRITE_FW(GEN7_OASTATUS2,
+		      (head & GEN7_OASTATUS2_HEAD_MASK) |
+		      GEN7_OASTATUS2_MEM_SELECT_GGTT);
+	WRITE_ONCE(dev_priv->perf.oa.oa_buffer.head, head);
 
 	return ret;
 }
@@ -1339,12 +1046,10 @@ free_oa_buffer(struct drm_i915_private *i915)
 {
 	mutex_lock(&i915->drm.struct_mutex);
 
-	i915_vma_unpin_and_release(&i915->perf.oa.oa_buffer.vma,
-				   I915_VMA_RELEASE_MAP);
+	i915_vma_unpin_iomap(i915->perf.oa.oa_buffer.vma);
+	i915_vma_unpin_and_release(&i915->perf.oa.oa_buffer.vma, 0);
 
 	mutex_unlock(&i915->drm.struct_mutex);
-
-	i915->perf.oa.oa_buffer.vaddr = NULL;
 }
 
 static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
@@ -1371,19 +1076,11 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 		oa_put_render_ctx_id(stream);
 
 	put_oa_config(dev_priv, stream->oa_config);
-
-	if (dev_priv->perf.oa.spurious_report_rs.missed) {
-		DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
-			 dev_priv->perf.oa.spurious_report_rs.missed);
-	}
 }
 
 static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 {
 	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
-	unsigned long flags;
-
-	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 
 	/* Pre-DevBDW: OABUFFER must be set with counters off,
 	 * before OASTATUS1, but after OASTATUS2
@@ -1396,31 +1093,12 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 
 	I915_WRITE(GEN7_OASTATUS1, gtt_offset | OABUFFER_SIZE_16M); /* tail */
 
-	/* Mark that we need updated tail pointers to read from... */
-	dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
-	dev_priv->perf.oa.oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
-
-	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
-
 	/* On Haswell we have to track which OASTATUS1 flags we've
 	 * already seen since they can't be cleared while periodic
 	 * sampling is enabled.
 	 */
 	dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
 
-	/* NB: although the OA buffer will initially be allocated
-	 * zeroed via shmfs (and so this memset is redundant when
-	 * first allocating), we may re-init the OA buffer, either
-	 * when re-enabling a stream or in error/reset paths.
-	 *
-	 * The reason we clear the buffer for each re-init is for the
-	 * sanity check in gen7_append_oa_reports() that looks at the
-	 * report-id field to make sure it's non-zero which relies on
-	 * the assumption that new reports are being written to zeroed
-	 * memory...
-	 */
-	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
-
 	/* Maybe make ->pollin per-stream state if we support multiple
 	 * concurrent streams in the future.
 	 */
@@ -1430,9 +1108,6 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 {
 	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
-	unsigned long flags;
-
-	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 
 	I915_WRITE(GEN8_OASTATUS, 0);
 	I915_WRITE(GEN8_OAHEADPTR, gtt_offset);
@@ -1452,10 +1127,6 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 		   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
 	I915_WRITE(GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
 
-	/* Mark that we need updated tail pointers to read from... */
-	dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
-	dev_priv->perf.oa.oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
-
 	/*
 	 * Reset state used to recognise context switches, affecting which
 	 * reports we will forward to userspace while filtering for a single
@@ -1463,22 +1134,6 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 */
 	dev_priv->perf.oa.oa_buffer.last_ctx_id = INVALID_CTX_ID;
 
-	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
-
-	/*
-	 * NB: although the OA buffer will initially be allocated
-	 * zeroed via shmfs (and so this memset is redundant when
-	 * first allocating), we may re-init the OA buffer, either
-	 * when re-enabling a stream or in error/reset paths.
-	 *
-	 * The reason we clear the buffer for each re-init is for the
-	 * sanity check in gen8_append_oa_reports() that looks at the
-	 * reason field to make sure it's non-zero which relies on
-	 * the assumption that new reports are being written to zeroed
-	 * memory...
-	 */
-	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
-
 	/*
 	 * Maybe make ->pollin per-stream state if we support multiple
 	 * concurrent streams in the future.
@@ -1513,16 +1168,19 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 	if (ret)
 		goto err_unref;
 
+	ret = i915_gem_object_set_to_gtt_domain(bo, true);
+	if (ret)
+		goto err_unref;
+
 	/* PreHSW required 512K alignment, HSW requires 16M */
-	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
+	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, PIN_MAPPABLE);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto err_unref;
 	}
 	dev_priv->perf.oa.oa_buffer.vma = vma;
 
-	dev_priv->perf.oa.oa_buffer.vaddr =
-		i915_gem_object_pin_map(bo, I915_MAP_WB);
+	dev_priv->perf.oa.oa_buffer.vaddr = i915_vma_pin_iomap(vma);
 	if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
 		ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
 		goto err_unpin;
@@ -1540,7 +1198,6 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 err_unref:
 	i915_gem_object_put(bo);
 
-	dev_priv->perf.oa.oa_buffer.vaddr = NULL;
 	dev_priv->perf.oa.oa_buffer.vma = NULL;
 
 unlock:
@@ -1865,8 +1522,7 @@ static void gen7_oa_enable(struct i915_perf_stream *stream)
 
 	I915_WRITE(GEN7_OACONTROL,
 		   (ctx_id & GEN7_OACONTROL_CTX_MASK) |
-		   (period_exponent <<
-		    GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
+		   (period_exponent << GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
 		   (periodic ? GEN7_OACONTROL_TIMER_ENABLE : 0) |
 		   (report_format << GEN7_OACONTROL_FORMAT_SHIFT) |
 		   (ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
@@ -1894,9 +1550,9 @@ static void gen8_oa_enable(struct i915_perf_stream *stream)
 	 * filtering and instead filter on the cpu based on the context-id
 	 * field of reports
 	 */
-	I915_WRITE(GEN8_OACONTROL, (report_format <<
-				    GEN8_OA_REPORT_FORMAT_SHIFT) |
-				   GEN8_OA_COUNTER_ENABLE);
+	I915_WRITE(GEN8_OACONTROL,
+		   (report_format << GEN8_OA_REPORT_FORMAT_SHIFT) |
+		   GEN8_OA_COUNTER_ENABLE);
 }
 
 /**
@@ -2028,26 +1684,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		return -EINVAL;
 	}
 
-	/* We set up some ratelimit state to potentially throttle any _NOTES
-	 * about spurious, invalid OA reports which we don't forward to
-	 * userspace.
-	 *
-	 * The initialization is associated with opening the stream (not driver
-	 * init) considering we print a _NOTE about any throttling when closing
-	 * the stream instead of waiting until driver _fini which no one would
-	 * ever see.
-	 *
-	 * Using the same limiting factors as printk_ratelimit()
-	 */
-	ratelimit_state_init(&dev_priv->perf.oa.spurious_report_rs,
-			     5 * HZ, 10);
-	/* Since we use a DRM_NOTE for spurious reports it would be
-	 * inconsistent to let __ratelimit() automatically print a warning for
-	 * throttling.
-	 */
-	ratelimit_set_flags(&dev_priv->perf.oa.spurious_report_rs,
-			    RATELIMIT_MSG_ON_RELEASE);
-
 	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
 
 	format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
@@ -3473,7 +3109,6 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
 
 		INIT_LIST_HEAD(&dev_priv->perf.streams);
 		mutex_init(&dev_priv->perf.lock);
-		spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
 
 		oa_sample_rate_hard_limit = 1000 *
 			(RUNTIME_INFO(dev_priv)->cs_timestamp_frequency_khz / 2);
-- 
2.20.1



More information about the Intel-gfx-trybot mailing list